Skip to content

Commit 99b8e9e

Browse files
authored
Merge pull request #25 from forcedotcom/W-18771596-fix-pickle-exception
Fix pickle exception
2 parents 9e3da9c + d6bd359 commit 99b8e9e

File tree

2 files changed

+56
-3
lines changed

2 files changed

+56
-3
lines changed

src/datacustomcode/io/reader/query_api.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
Union,
2222
)
2323

24+
import pandas.api.types as pd_types
2425
from pyspark.sql.types import (
2526
BooleanType,
2627
DoubleType,
@@ -48,8 +49,6 @@
4849
"object": StringType(),
4950
"int64": LongType(),
5051
"float64": DoubleType(),
51-
"datetime64[ns]": TimestampType(),
52-
"datetime64[ns, UTC]": TimestampType(),
5352
"bool": BooleanType(),
5453
}
5554

@@ -59,7 +58,11 @@ def _pandas_to_spark_schema(
5958
) -> StructType:
6059
fields = []
6160
for column, dtype in pandas_df.dtypes.items():
62-
spark_type = PANDAS_TYPE_MAPPING.get(str(dtype), StringType())
61+
spark_type: AtomicType
62+
if pd_types.is_datetime64_any_dtype(dtype):
63+
spark_type = TimestampType()
64+
else:
65+
spark_type = PANDAS_TYPE_MAPPING.get(str(dtype), StringType())
6366
fields.append(StructField(column, spark_type, nullable))
6467
return StructType(fields)
6568

tests/io/reader/test_query_api.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
StringType,
1515
StructField,
1616
StructType,
17+
TimestampType,
1718
)
1819
import pytest
1920

@@ -59,6 +60,55 @@ def test_pandas_to_spark_schema_nullable(self):
5960
schema = _pandas_to_spark_schema(df, nullable=False)
6061
assert not schema.fields[0].nullable
6162

63+
def test_pandas_to_spark_schema_datetime_types(self):
64+
"""Test conversion of pandas datetime types to Spark TimestampType."""
65+
66+
# Create test data with different datetime types
67+
data = {
68+
"datetime_ns": pd.to_datetime(
69+
["2023-01-01 10:00:00", "2023-01-02 11:00:00"]
70+
),
71+
"datetime_ns_utc": pd.to_datetime(
72+
["2023-01-01 10:00:00", "2023-01-02 11:00:00"], utc=True
73+
),
74+
"datetime_ms": pd.to_datetime(
75+
["2023-01-01 10:00:00", "2023-01-02 11:00:00"]
76+
).astype("datetime64[ms]"),
77+
"datetime_ms_utc": pd.to_datetime(
78+
["2023-01-01 10:00:00", "2023-01-02 11:00:00"], utc=True
79+
)
80+
.tz_localize(None)
81+
.astype("datetime64[ms]"),
82+
}
83+
df = pd.DataFrame(data)
84+
85+
# Convert to Spark schema
86+
schema = _pandas_to_spark_schema(df)
87+
88+
# Verify the schema
89+
assert isinstance(schema, StructType)
90+
assert len(schema.fields) == 4
91+
92+
# Check that all datetime columns map to TimestampType
93+
field_dict = {field.name: field for field in schema.fields}
94+
for field_name in [
95+
"datetime_ns",
96+
"datetime_ns_utc",
97+
"datetime_ms",
98+
"datetime_ms_utc",
99+
]:
100+
assert isinstance(field_dict[field_name].dataType, TimestampType), (
101+
f"Field {field_name} should be TimestampType, "
102+
f"got {type(field_dict[field_name].dataType)}"
103+
)
104+
assert field_dict[field_name].nullable
105+
106+
# Verify the actual pandas dtypes to ensure our test data has the expected types
107+
assert str(df["datetime_ns"].dtype) == "datetime64[ns]"
108+
assert str(df["datetime_ns_utc"].dtype) == "datetime64[ns, UTC]"
109+
assert str(df["datetime_ms"].dtype) == "datetime64[ms]"
110+
assert str(df["datetime_ms_utc"].dtype) == "datetime64[ms]"
111+
62112

63113
# Completely isolated test class for QueryAPIDataCloudReader
64114
@pytest.mark.usefixtures("patch_all_requests")

0 commit comments

Comments
 (0)