fix: convert to datetime64[us] when TIMESTAMP or DATETIME is empty #858

Open: wants to merge 12 commits into main.
10 changes: 5 additions & 5 deletions docs/reading.rst
@@ -56,17 +56,17 @@ Inferring the DataFrame's dtypes
The :func:`~pandas_gbq.read_gbq` method infers the pandas dtype for each
column, based on the BigQuery table schema.

-================== =========================
+================== ============================================
BigQuery Data Type dtype
-================== =========================
+================== ============================================
BOOL               boolean
INT64              Int64
FLOAT64            float64
TIME               dbtime
DATE               dbdate or object
-DATETIME           datetime64[ns] or object
-TIMESTAMP          datetime64[ns, UTC] or object
-================== =========================
+DATETIME           datetime64[ns] (datetime64[us] if pandas version >= 2.1.0) or object
+TIMESTAMP          datetime64[ns, UTC] (datetime64[us, UTC] if pandas version >= 2.1.0) or object
+================== ============================================

If any DATE/DATETIME/TIMESTAMP value is outside of the range of `pandas.Timestamp.min
<https://pandas.pydata.org/docs/reference/api/pandas.Timestamp.min.html>`__
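To make the documented mapping concrete, here is a hedged sketch of how the returned dtype could be checked for a query that yields no rows. The dataset, table, and column names are placeholders (not part of this change), and the version comparison mirrors the feature check added in pandas_gbq/features.py below.

```python
import packaging.version
import pandas

import pandas_gbq

# Placeholder names: any DATETIME column that returns zero rows will do.
df = pandas_gbq.read_gbq("SELECT datetime_col FROM my_dataset.my_table WHERE FALSE")

if packaging.version.parse(pandas.__version__) >= packaging.version.parse("2.1.0"):
    # Newer pandas defaults to microsecond resolution for datetime columns.
    assert df["datetime_col"].dtype == "datetime64[us]"
else:
    assert df["datetime_col"].dtype == "datetime64[ns]"
```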
8 changes: 8 additions & 0 deletions pandas_gbq/features.py
@@ -9,6 +9,7 @@
BIGQUERY_QUERY_AND_WAIT_VERSION = "3.14.0"
PANDAS_VERBOSITY_DEPRECATION_VERSION = "0.23.0"
PANDAS_BOOLEAN_DTYPE_VERSION = "1.0.0"
PANDAS_MICROSECONDS_DATETIME_VERSION = "2.1.0"


class Features:
@@ -81,5 +82,12 @@ def pandas_has_boolean_dtype(self):
desired_version = packaging.version.parse(PANDAS_BOOLEAN_DTYPE_VERSION)
return self.pandas_installed_version >= desired_version

@property
def pandas_has_microseconds_datetime(self):
import packaging.version

desired_version = packaging.version.parse(PANDAS_MICROSECONDS_DATETIME_VERSION)
return self.pandas_installed_version >= desired_version


FEATURES = Features()
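The new property mirrors pandas_has_boolean_dtype: parse the installed pandas version and compare it against the 2.1.0 threshold. Downstream code can then branch on the flag; a minimal usage sketch (the variable name is illustrative, not part of the change):

```python
from pandas_gbq.features import FEATURES

# Choose the dtype applied to empty DATETIME columns, mirroring the branch
# added in pandas_gbq/gbq.py; "empty_datetime_dtype" is an illustrative name.
empty_datetime_dtype = (
    "datetime64[us]"
    if FEATURES.pandas_has_microseconds_datetime
    else "datetime64[ns]"
)
print(empty_datetime_dtype)
```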
9 changes: 9 additions & 0 deletions pandas_gbq/gbq.py
@@ -630,6 +630,7 @@ def _finalize_dtypes(
"""
import db_dtypes
import pandas.api.types
import pandas

# If you update this mapping, also update the table at
# `docs/reading.rst`.
@@ -638,6 +639,14 @@ def _finalize_dtypes(
"DATETIME": "datetime64[ns]",
"TIMESTAMP": "datetime64[ns]",
}
if FEATURES.pandas_has_microseconds_datetime:
# When pandas is 2.1.0 or later, the default timestamp dtype is 'datetime64[us]',
# so use it (and the matching tz-aware dtype) instead of 'datetime64[ns]'.
dtype_map = {
"DATE": db_dtypes.DateDtype(),
"DATETIME": "datetime64[us]",
"TIMESTAMP": pandas.DatetimeTZDtype(unit="us", tz="UTC"),
}

for field in schema_fields:
# This method doesn't modify ARRAY/REPEATED columns.
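For reference, a rough illustration of what the remapped entries resolve to (a sketch, not the library's code path; assumes pandas >= 2.0, where non-nanosecond units are supported):

```python
import pandas

# Empty columns built with the new map carry microsecond resolution,
# with and without a timezone.
datetime_col = pandas.Series([], dtype="datetime64[us]")
timestamp_col = pandas.Series([], dtype=pandas.DatetimeTZDtype(unit="us", tz="UTC"))

print(datetime_col.dtype)   # datetime64[us]
print(timestamp_col.dtype)  # datetime64[us, UTC]
```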
2 changes: 2 additions & 0 deletions testing/constraints-3.10.txt
@@ -0,0 +1,2 @@
numpy==1.26.4
pandas==2.0.3
1 change: 1 addition & 0 deletions testing/constraints-3.11.txt
@@ -0,0 +1 @@
pandas==2.1.4
4 changes: 2 additions & 2 deletions testing/constraints-3.9.txt
@@ -1,2 +1,2 @@
-numpy==1.19.4
-pandas==1.1.4
+numpy==1.20.3
+pandas==1.5.3
11 changes: 8 additions & 3 deletions tests/system/test_read_gbq.py
@@ -16,6 +16,7 @@

from pandas_gbq.features import FEATURES


QueryTestCase = collections.namedtuple(
"QueryTestCase",
["query", "expected", "use_bqstorage_apis"],
@@ -628,7 +629,9 @@ def test_empty_dataframe(read_gbq, use_bqstorage_api):
),
"datetime_col": pandas.Series(
[],
dtype="datetime64[ns]",
dtype="datetime64[us]"
if FEATURES.pandas_has_microseconds_datetime
else "datetime64[ns]",
),
"float_col": pandas.Series([], dtype="float64"),
"int64_col": pandas.Series([], dtype="Int64"),
@@ -640,8 +643,10 @@ def test_empty_dataframe(read_gbq, use_bqstorage_api):
),
"timestamp_col": pandas.Series(
[],
dtype="datetime64[ns]",
).dt.tz_localize(datetime.timezone.utc),
dtype=pandas.DatetimeTZDtype(unit="us", tz="UTC")
if FEATURES.pandas_has_microseconds_datetime
else pandas.DatetimeTZDtype(tz="UTC"),
),
}
)
result = read_gbq(query, use_bqstorage_api=use_bqstorage_api)
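The dtype of the expected empty frame matters because pandas' testing helpers compare dtypes even when there are no values to compare. A hedged, standalone illustration (no BigQuery involved; assumes pandas >= 2.0 for the microsecond unit):

```python
import pandas
from pandas import testing

left = pandas.Series([], dtype="datetime64[ns]", name="datetime_col")
right = pandas.Series([], dtype="datetime64[us]", name="datetime_col")

try:
    testing.assert_series_equal(left, right)
except AssertionError as exc:
    # Both series are empty, but the dtype difference alone fails the check.
    print(exc)
```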
49 changes: 30 additions & 19 deletions tests/system/test_to_gbq.py
@@ -16,6 +16,9 @@
pytest.importorskip("google.cloud.bigquery", minversion="1.24.0")


PANDAS_VERSION = tuple(int(part) for part in pandas.__version__.split(".")[:2])


@pytest.fixture(params=["load_parquet", "load_csv"])
def api_method(request):
return request.param
@@ -343,25 +346,33 @@ def test_series_round_trip(
# require `date_as_object` parameter in
# google-cloud-bigquery versions 1.x and 2.x, but not 3.x.
# https://github.com/googleapis/python-bigquery-pandas/issues/365
"datetime_col": [
datetime.datetime(1, 1, 1),
datetime.datetime(1970, 1, 1),
datetime.datetime(9999, 12, 31, 23, 59, 59, 999999),
],
"timestamp_col": [
datetime.datetime(1, 1, 1, tzinfo=datetime.timezone.utc),
datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc),
datetime.datetime(
9999,
12,
31,
23,
59,
59,
999999,
tzinfo=datetime.timezone.utc,
),
],
"datetime_col": pandas.Series(
[
datetime.datetime(1, 1, 1),
datetime.datetime(1970, 1, 1),
datetime.datetime(9999, 12, 31, 23, 59, 59, 999999),
],
dtype="object" if PANDAS_VERSION < (2, 1) else "datetime64[us]",
),
"timestamp_col": pandas.Series(
[
datetime.datetime(1, 1, 1, tzinfo=datetime.timezone.utc),
datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc),
datetime.datetime(
9999,
12,
31,
23,
59,
59,
999999,
tzinfo=datetime.timezone.utc,
),
],
dtype="object"
if PANDAS_VERSION < (2, 1)
else pandas.DatetimeTZDtype(unit="us", tz="UTC"),
),
},
columns=["row_num", "date_col", "datetime_col", "timestamp_col"],
),
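The explicit dtypes are needed because year 1 and year 9999 fall outside the nanosecond range of pandas.Timestamp, while microsecond resolution can represent them; under older pandas the columns therefore stay as object. A hedged sketch of that constraint (the datetime64[us] branch assumes pandas >= 2.0):

```python
import datetime

import pandas

extremes = [
    datetime.datetime(1, 1, 1),
    datetime.datetime(9999, 12, 31, 23, 59, 59, 999999),
]

# Microsecond resolution covers the full BigQuery DATETIME range...
wide = pandas.Series(extremes, dtype="datetime64[us]")
print(wide.dtype)  # datetime64[us]

# ...but these values overflow nanosecond resolution.
try:
    pandas.Series(extremes, dtype="datetime64[ns]")
except Exception as exc:  # expected: pandas.errors.OutOfBoundsDatetime
    print(type(exc).__name__)
```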
56 changes: 56 additions & 0 deletions tests/unit/test_gbq.py
@@ -113,6 +113,62 @@ def test__bqschema_to_nullsafe_dtypes(type_, expected):
assert result == {"x": expected}


@pytest.mark.parametrize(
("data", "schema_type", "expected"),
[
pytest.param(
pandas.to_datetime(["2017-01-01T12:00:00Z"]).astype(
pandas.DatetimeTZDtype(
unit="us" if FEATURES.pandas_has_microseconds_datetime else "ns",
tz="UTC",
),
),
"TIMESTAMP",
pandas.DatetimeTZDtype(
unit="us" if FEATURES.pandas_has_microseconds_datetime else "ns",
tz="UTC",
),
),
(
pandas.to_datetime([]).astype(object),
"TIMESTAMP",
pandas.DatetimeTZDtype(
unit="us" if FEATURES.pandas_has_microseconds_datetime else "ns",
tz="UTC",
),
),
(
pandas.to_datetime(["2017-01-01T12:00:00"]).astype(
"datetime64[us]"
if FEATURES.pandas_has_microseconds_datetime
else "datetime64[ns]",
),
"DATETIME",
numpy.dtype(
"datetime64[us]"
if FEATURES.pandas_has_microseconds_datetime
else "datetime64[ns]",
),
),
(
pandas.to_datetime([]).astype(object),
"DATETIME",
numpy.dtype(
"datetime64[us]"
if FEATURES.pandas_has_microseconds_datetime
else "datetime64[ns]",
),
),
],
)
def test__finalize_dtypes(data, schema_type, expected):
result = gbq._finalize_dtypes(
pandas.DataFrame(dict(x=data)),
[dict(name="x", type=schema_type, mode="NULLABLE")],
)
assert result["x"].dtype == expected


@pytest.mark.parametrize(
["query_or_table", "expected"],
[