
fix: convert to datetime64[us] when TIMESTAMP or DATETIME is empty #858

Open · wants to merge 12 commits into main
10 changes: 5 additions & 5 deletions docs/reading.rst
@@ -56,17 +56,17 @@ Inferring the DataFrame's dtypes
The :func:`~pandas_gbq.read_gbq` method infers the pandas dtype for each
column, based on the BigQuery table schema.

-================== =========================
+================== ============================================
BigQuery Data Type dtype
-================== =========================
+================== ============================================
BOOL               boolean
INT64              Int64
FLOAT64            float64
TIME               dbtime
DATE               dbdate or object
-DATETIME           datetime64[ns] or object
-TIMESTAMP          datetime64[ns, UTC] or object
-================== =========================
+DATETIME           datetime64[ns] (datetime64[us] if pandas version >= 2.1.0) or object
+TIMESTAMP          datetime64[ns, UTC] (datetime64[us, UTC] if pandas version >= 2.1.0) or object
+================== ============================================

If any DATE/DATETIME/TIMESTAMP value is outside of the range of `pandas.Timestamp.min
<https://pandas.pydata.org/docs/reference/api/pandas.Timestamp.min.html>`__
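As a quick illustration of the mapping above, a hedged sketch (not part of this diff: the query and project id are placeholders, and the printed dtype assumes this change is applied on pandas >= 2.1.0): an empty TIMESTAMP result should come back with the timezone-aware dtype listed in the table.

    import pandas_gbq

    # Placeholder query that returns zero rows with a single TIMESTAMP column.
    df = pandas_gbq.read_gbq(
        "SELECT CURRENT_TIMESTAMP() AS ts LIMIT 0",
        project_id="my-project",  # placeholder project id
    )
    # Expected: datetime64[us, UTC] on pandas >= 2.1.0, datetime64[ns, UTC] otherwise.
    print(df["ts"].dtype)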
9 changes: 9 additions & 0 deletions pandas_gbq/gbq.py
@@ -630,6 +630,7 @@ def _finalize_dtypes(
    """
    import db_dtypes
    import pandas.api.types
+    import pandas

    # If you update this mapping, also update the table at
    # `docs/reading.rst`.
@@ -638,6 +639,14 @@
        "DATETIME": "datetime64[ns]",
        "TIMESTAMP": "datetime64[ns]",
    }
+    if tuple(int(part) for part in pandas.__version__.split(".")[:2]) >= (2, 1):
+        # When pandas is 2.1.0 or later, the default timestamp dtype is 'datetime64[us]',
+        # so use 'datetime64[us]' instead of 'datetime64[ns]'.
+        dtype_map = {
+            "DATE": db_dtypes.DateDtype(),
+            "DATETIME": "datetime64[us]",
+            "TIMESTAMP": pandas.DatetimeTZDtype(unit="us", tz="UTC"),
+        }

    for field in schema_fields:
        # This method doesn't modify ARRAY/REPEATED columns.
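For reference, a minimal standalone sketch of the version gate used in the hunk above (illustration only, not part of the change): the same major/minor tuple comparison selects microsecond-precision dtypes on pandas 2.1.0 and later and falls back to nanosecond precision otherwise.

    import pandas

    PANDAS_VERSION = tuple(int(part) for part in pandas.__version__.split(".")[:2])

    if PANDAS_VERSION >= (2, 1):
        # Matches the updated dtype_map: microsecond precision on pandas 2.1.0+.
        timestamp_dtype = pandas.DatetimeTZDtype(unit="us", tz="UTC")
        datetime_dtype = "datetime64[us]"
    else:
        # Nanosecond precision for older pandas, matching the default dtype_map.
        timestamp_dtype = pandas.DatetimeTZDtype(unit="ns", tz="UTC")
        datetime_dtype = "datetime64[ns]"

    print(timestamp_dtype, datetime_dtype)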
2 changes: 2 additions & 0 deletions testing/constraints-3.10.txt
@@ -0,0 +1,2 @@
numpy==1.26.4
pandas==2.0.3
1 change: 1 addition & 0 deletions testing/constraints-3.11.txt
@@ -0,0 +1 @@
pandas==2.1.4
4 changes: 2 additions & 2 deletions testing/constraints-3.9.txt
@@ -1,2 +1,2 @@
-numpy==1.19.4
-pandas==1.1.4
+numpy==1.20.3
+pandas==1.5.3
71 changes: 71 additions & 0 deletions tests/unit/test_gbq.py
@@ -28,6 +28,9 @@
pytestmark = pytest.mark.filterwarnings("ignore:credentials from Google Cloud SDK")


+PANDAS_VERSION = tuple(int(part) for part in pandas.__version__.split(".")[:2])
+
+
def _make_connector(project_id="some-project", **kwargs):
    return gbq.GbqConnector(project_id, **kwargs)

@@ -113,6 +116,74 @@ def test__bqschema_to_nullsafe_dtypes(type_, expected):
    assert result == {"x": expected}


+@pytest.mark.parametrize(
+    ("data", "schema_type", "expected"),
+    [
+        pytest.param(
+            pandas.to_datetime(["2017-01-01T12:00:00Z"]).astype(
+                pandas.DatetimeTZDtype(
+                    # Microseconds aren't supported until newer pandas.
+                    # https://github.com/googleapis/python-bigquery-pandas/issues/852
+                    unit="us" if PANDAS_VERSION >= (2, 1) else "ns",
+                    tz="UTC",
+                ),
+            ),
+            "TIMESTAMP",
+            pandas.DatetimeTZDtype(
+                # Microseconds aren't supported until newer pandas.
+                # https://github.com/googleapis/python-bigquery-pandas/issues/852
+                unit="us" if PANDAS_VERSION >= (2, 1) else "ns",
+                tz="UTC",
+            ),
+        ),
+        (
+            pandas.to_datetime([]).astype(object),
+            "TIMESTAMP",
+            pandas.DatetimeTZDtype(
+                # Microseconds aren't supported until newer pandas.
+                # https://github.com/googleapis/python-bigquery-pandas/issues/852
+                unit="us" if PANDAS_VERSION >= (2, 1) else "ns",
+                tz="UTC",
+            ),
+        ),
+        (
+            pandas.to_datetime(["2017-01-01T12:00:00"]).astype(
+                # Microseconds aren't supported until newer pandas.
+                # https://github.com/googleapis/python-bigquery-pandas/issues/852
+                "datetime64[us]"
+                if PANDAS_VERSION >= (2, 1)
+                else "datetime64[ns]",
+            ),
+            "DATETIME",
+            numpy.dtype(
+                # Microseconds aren't supported until newer pandas.
+                # https://github.com/googleapis/python-bigquery-pandas/issues/852
+                "datetime64[us]"
+                if PANDAS_VERSION >= (2, 1)
+                else "datetime64[ns]",
+            ),
+        ),
+        (
+            pandas.to_datetime([]).astype(object),
+            "DATETIME",
+            numpy.dtype(
+                # Microseconds aren't supported until newer pandas.
+                # https://github.com/googleapis/python-bigquery-pandas/issues/852
+                "datetime64[us]"
+                if PANDAS_VERSION >= (2, 1)
+                else "datetime64[ns]",
+            ),
+        ),
+    ],
+)
+def test__finalize_dtypes(data, schema_type, expected):
+    result = gbq._finalize_dtypes(
+        pandas.DataFrame(dict(x=data)),
+        [dict(name="x", type=schema_type, mode="NULLABLE")],
+    )
+    assert result["x"].dtype == expected


@pytest.mark.parametrize(
["query_or_table", "expected"],
[
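Beyond pytest, the same behavior can be checked interactively; a minimal sketch mirroring the empty-column DATETIME case above (it assumes this change is applied and imports gbq the same way the tests do):

    import pandas
    from pandas_gbq import gbq

    # An empty, all-object column declared as DATETIME in the schema should be
    # finalized to a datetime64 dtype: microsecond precision on pandas >= 2.1.0.
    frame = pandas.DataFrame({"x": pandas.to_datetime([]).astype(object)})
    result = gbq._finalize_dtypes(
        frame,
        [{"name": "x", "type": "DATETIME", "mode": "NULLABLE"}],
    )
    print(result["x"].dtype)  # datetime64[us] on pandas >= 2.1.0, datetime64[ns] otherwise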