Skip to content

Commit ead27f8

Browse files
committed
bug: support lag for non-UTC datetime cursor fields
- Add `ensure_pendulum_datetime_non_utc` to parse datetime strings into non-UTC datetime objects. - Add `_datetime_obj_to_str` to preserve the colon in the timezone when converting datetime objects back to strings. - Skip writing back state if no valid rows are found for `last_value` in the transformer, which may otherwise cause incorrect behavior.
1 parent 5a1cb69 commit ead27f8

File tree

5 files changed

+114
-12
lines changed

5 files changed

+114
-12
lines changed

dlt/common/time.py

+37-4
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import contextlib
22
import datetime # noqa: I251
33
import re
4+
import sys
45
from typing import Any, Optional, Union, overload, TypeVar, Callable # noqa
56

67
from pendulum.parsing import (
@@ -125,6 +126,38 @@ def ensure_pendulum_datetime(value: TAnyDateTime) -> pendulum.DateTime:
125126
raise TypeError(f"Cannot coerce {value} to a pendulum.DateTime object.")
126127

127128

129+
def ensure_pendulum_datetime_non_utc(value: TAnyDateTime) -> pendulum.DateTime:
130+
if isinstance(value, datetime.datetime):
131+
ret = pendulum.instance(value)
132+
return ret
133+
elif isinstance(value, datetime.date):
134+
return pendulum.datetime(value.year, value.month, value.day)
135+
elif isinstance(value, (int, float, str)):
136+
result = _datetime_from_ts_or_iso(value)
137+
if isinstance(result, datetime.time):
138+
raise ValueError(f"Cannot coerce {value} to a pendulum.DateTime object.")
139+
if isinstance(result, pendulum.DateTime):
140+
return result
141+
return pendulum.datetime(result.year, result.month, result.day)
142+
raise TypeError(f"Cannot coerce {value} to a pendulum.DateTime object.")
143+
144+
145+
def datatime_obj_to_str(
146+
datatime: Union[datetime.datetime, datetime.date], datetime_format: str
147+
) -> str:
148+
if sys.version_info < (3, 12, 0) and "%:z" in datetime_format:
149+
modified_format = datetime_format.replace("%:z", "%z")
150+
datetime_str = datatime.strftime(modified_format)
151+
152+
timezone_part = datetime_str[-5:] if len(datetime_str) >= 5 else ""
153+
if timezone_part.startswith(("-", "+")):
154+
return f"{datetime_str[:-5]}{timezone_part[:3]}:{timezone_part[3:]}"
155+
156+
raise ValueError(f"Invalid timezone format in datetime string: {datetime_str}")
157+
158+
return datatime.strftime(datetime_format)
159+
160+
128161
def ensure_pendulum_time(value: Union[str, datetime.time]) -> pendulum.Time:
129162
"""Coerce a time value to a `pendulum.Time` object.
130163
@@ -164,27 +197,27 @@ def detect_datetime_format(value: str) -> Optional[str]:
164197
): "%Y-%m-%dT%H:%M:%S.%fZ", # UTC with fractional seconds
165198
re.compile(
166199
r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\+\d{2}:\d{2}$"
167-
): "%Y-%m-%dT%H:%M:%S%z", # Positive timezone offset
200+
): "%Y-%m-%dT%H:%M:%S%:z", # Positive timezone offset
168201
re.compile(
169202
r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\+\d{4}$"
170203
): "%Y-%m-%dT%H:%M:%S%z", # Positive timezone without colon
171204
# Full datetime with fractional seconds and positive timezone offset
172205
re.compile(
173206
r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+\+\d{2}:\d{2}$"
174-
): "%Y-%m-%dT%H:%M:%S.%f%z",
207+
): "%Y-%m-%dT%H:%M:%S.%f%:z",
175208
re.compile(
176209
r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+\+\d{4}$"
177210
): "%Y-%m-%dT%H:%M:%S.%f%z", # Positive timezone without colon
178211
re.compile(
179212
r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}-\d{2}:\d{2}$"
180-
): "%Y-%m-%dT%H:%M:%S%z", # Negative timezone offset
213+
): "%Y-%m-%dT%H:%M:%S%:z", # Negative timezone offset
181214
re.compile(
182215
r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}-\d{4}$"
183216
): "%Y-%m-%dT%H:%M:%S%z", # Negative timezone without colon
184217
# Full datetime with fractional seconds and negative timezone offset
185218
re.compile(
186219
r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+-\d{2}:\d{2}$"
187-
): "%Y-%m-%dT%H:%M:%S.%f%z",
220+
): "%Y-%m-%dT%H:%M:%S.%f%:z",
188221
re.compile(
189222
r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+-\d{4}$"
190223
): "%Y-%m-%dT%H:%M:%S.%f%z", # Negative Timezone without colon

dlt/extract/incremental/__init__.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import os
22
from datetime import datetime # noqa: I251
33
from typing import Generic, ClassVar, Any, Optional, Type, Dict, Union, Literal, Tuple
4+
45
from typing_extensions import get_args
56

67
import inspect
@@ -560,8 +561,14 @@ def __call__(self, rows: TDataItems, meta: Any = None) -> Optional[TDataItems]:
560561
else:
561562
rows = self._transform_item(transformer, rows)
562563

563-
# write back state
564+
# ensure last_value maintains forward-only progression when lag is applied
565+
if self.lag and (cached_last_value := self._cached_state.get("last_value")):
566+
transformer.last_value = self.last_value_func( # type: ignore[call-arg]
567+
(transformer.last_value, cached_last_value)
568+
)
569+
# writing back state
564570
self._cached_state["last_value"] = transformer.last_value
571+
565572
if not transformer.deduplication_disabled:
566573
# compute hashes for new last rows
567574
unique_hashes = set(

dlt/extract/incremental/lag.py

+8-3
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,13 @@
1+
import sys
12
from datetime import datetime, timedelta, date # noqa: I251
23
from typing import Union
34

45
from dlt.common import logger
5-
from dlt.common.time import ensure_pendulum_datetime, detect_datetime_format
6+
from dlt.common.time import (
7+
detect_datetime_format,
8+
ensure_pendulum_datetime_non_utc,
9+
datatime_obj_to_str,
10+
)
611

712
from . import TCursorValue, LastValueFunc
813

@@ -17,12 +22,12 @@ def _apply_lag_to_value(
1722
is_str = isinstance(value, str)
1823
value_format = detect_datetime_format(value) if is_str else None
1924
is_str_date = value_format in ("%Y%m%d", "%Y-%m-%d") if value_format else None
20-
parsed_value = ensure_pendulum_datetime(value) if is_str else value
25+
parsed_value = ensure_pendulum_datetime_non_utc(value) if is_str else value
2126

2227
if isinstance(parsed_value, (datetime, date)):
2328
parsed_value = _apply_lag_to_datetime(lag, parsed_value, last_value_func, is_str_date) # type: ignore[assignment]
2429
# go back to string or pass exact type
25-
value = parsed_value.strftime(value_format) if value_format else parsed_value # type: ignore[assignment]
30+
value = datatime_obj_to_str(parsed_value, value_format) if value_format else parsed_value # type: ignore[assignment]
2631

2732
elif isinstance(parsed_value, (int, float)):
2833
value = _apply_lag_to_number(lag, parsed_value, last_value_func) # type: ignore[assignment]

tests/common/test_time.py

+28-4
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,10 @@
1212
datetime_to_timestamp,
1313
datetime_to_timestamp_ms,
1414
detect_datetime_format,
15+
ensure_pendulum_datetime_non_utc,
1516
)
1617
from dlt.common.typing import TAnyDateTime
18+
from dlt.common.time import datatime_obj_to_str
1719

1820

1921
def test_timestamp_within() -> None:
@@ -132,21 +134,21 @@ def test_datetime_to_timestamp_helpers(
132134
[
133135
("2024-10-20T15:30:00Z", "%Y-%m-%dT%H:%M:%SZ"), # UTC 'Z'
134136
("2024-10-20T15:30:00.123456Z", "%Y-%m-%dT%H:%M:%S.%fZ"), # UTC 'Z' with fractional seconds
135-
("2024-10-20T15:30:00+02:00", "%Y-%m-%dT%H:%M:%S%z"), # Positive timezone offset
137+
("2024-10-20T15:30:00+02:00", "%Y-%m-%dT%H:%M:%S%:z"), # Positive timezone offset
136138
("2024-10-20T15:30:00+0200", "%Y-%m-%dT%H:%M:%S%z"), # Positive timezone offset (no colon)
137139
(
138140
"2024-10-20T15:30:00.123456+02:00",
139-
"%Y-%m-%dT%H:%M:%S.%f%z",
141+
"%Y-%m-%dT%H:%M:%S.%f%:z",
140142
), # Positive timezone offset with fractional seconds
141143
(
142144
"2024-10-20T15:30:00.123456+0200",
143145
"%Y-%m-%dT%H:%M:%S.%f%z",
144146
), # Positive timezone offset with fractional seconds (no colon)
145-
("2024-10-20T15:30:00-02:00", "%Y-%m-%dT%H:%M:%S%z"), # Negative timezone offset
147+
("2024-10-20T15:30:00-02:00", "%Y-%m-%dT%H:%M:%S%:z"), # Negative timezone offset
146148
("2024-10-20T15:30:00-0200", "%Y-%m-%dT%H:%M:%S%z"), # Negative timezone offset (no colon)
147149
(
148150
"2024-10-20T15:30:00.123456-02:00",
149-
"%Y-%m-%dT%H:%M:%S.%f%z",
151+
"%Y-%m-%dT%H:%M:%S.%f%:z",
150152
), # Negative timezone offset with fractional seconds
151153
(
152154
"2024-10-20T15:30:00.123456-0200",
@@ -170,6 +172,28 @@ def test_detect_datetime_format(value, expected_format) -> None:
170172
assert ensure_pendulum_datetime(value) is not None
171173

172174

175+
@pytest.mark.parametrize(
176+
"datetime_str, datetime_format, expected_value",
177+
[
178+
("2024-10-20T15:30:00+02:00", "%Y-%m-%dT%H:%M:%S%:z", "2024-10-20T15:30:00+02:00"),
179+
("2024-10-20T15:30:00+0200", "%Y-%m-%dT%H:%M:%S%z", "2024-10-20T15:30:00+0200"),
180+
(
181+
"2024-10-20T15:30:00.123456-02:00",
182+
"%Y-%m-%dT%H:%M:%S.%f%:z",
183+
"2024-10-20T15:30:00.123456-02:00",
184+
),
185+
(
186+
"2024-10-20T15:30:00.123456-0200",
187+
"%Y-%m-%dT%H:%M:%S.%f%z",
188+
"2024-10-20T15:30:00.123456-0200",
189+
),
190+
],
191+
)
192+
def test_datatime_obj_to_str(datetime_str, datetime_format, expected_value) -> None:
193+
datetime = ensure_pendulum_datetime_non_utc(datetime_str)
194+
assert datatime_obj_to_str(datetime, datetime_format) == expected_value
195+
196+
173197
@pytest.mark.parametrize(
174198
"value",
175199
[

tests/extract/test_incremental.py

+33
Original file line numberDiff line numberDiff line change
@@ -606,6 +606,39 @@ def some_data(created_at=dlt.sources.incremental("created_at", initial_value)):
606606
assert s["last_value"] == initial_value + timedelta(minutes=4)
607607

608608

609+
@pytest.mark.parametrize("item_type", ALL_TEST_DATA_ITEM_FORMATS)
610+
def test_incremental_transform_return_empty_rows_with_lag(item_type: TestDataItemFormat) -> None:
611+
@dlt.resource
612+
def some_data(
613+
created_at=dlt.sources.incremental(
614+
"created_at", initial_value="2024-11-01T08:00:00+08:00", lag=3600
615+
)
616+
):
617+
yield from source_items
618+
619+
p = dlt.pipeline(pipeline_name=uniq_id())
620+
621+
first_run_data = [{"id": 1, "value": 10, "created_at": "2024-11-01T12:00:00+08:00"}]
622+
source_items = data_to_item_format(item_type, first_run_data)
623+
624+
p.extract(some_data())
625+
s = p.state["sources"][p.default_schema_name]["resources"]["some_data"]["incremental"][
626+
"created_at"
627+
]
628+
629+
assert s["last_value"] == "2024-11-01T12:00:00+08:00"
630+
631+
second_run_data = [{"id": 1, "value": 10, "created_at": "2024-11-01T10:00:00+08:00"}]
632+
source_items = data_to_item_format(item_type, second_run_data)
633+
634+
p.extract(some_data())
635+
s = p.state["sources"][p.default_schema_name]["resources"]["some_data"]["incremental"][
636+
"created_at"
637+
]
638+
639+
assert s["last_value"] == "2024-11-01T12:00:00+08:00"
640+
641+
609642
@pytest.mark.parametrize("item_type", ALL_TEST_DATA_ITEM_FORMATS)
610643
def test_descending_order_unique_hashes(item_type: TestDataItemFormat) -> None:
611644
"""Resource returns items in descending order but using `max` last value function.

0 commit comments

Comments
 (0)