1
1
import abc
2
2
import asyncio
3
+ import functools
3
4
from enum import Enum
4
5
from dataclasses import dataclass
5
6
from datetime import UTC , datetime , timedelta
54
55
55
56
56
57
class Triggers (Enum ):
57
- BACKFILL = ' BACKFILL'
58
+ BACKFILL = " BACKFILL"
58
59
59
60
60
61
class BaseDocument (BaseModel ):
@@ -114,13 +115,14 @@ def path(self) -> list[str]:
114
115
_ResourceConfig = TypeVar ("_ResourceConfig" , bound = ResourceConfig )
115
116
116
117
117
- CRON_REGEX = (r"^"
118
+ CRON_REGEX = (
119
+ r"^"
118
120
r"((?:[0-5]?\d(?:-[0-5]?\d)?|\*(?:/[0-5]?\d)?)(?:,(?:[0-5]?\d(?:-[0-5]?\d)?|\*(?:/[0-5]?\d)?))*)\s+" # minute
119
121
r"((?:[01]?\d|2[0-3]|(?:[01]?\d|2[0-3])-(?:[01]?\d|2[0-3])|\*(?:/[01]?\d|/2[0-3])?)(?:,(?:[01]?\d|2[0-3]|(?:[01]?\d|2[0-3])-(?:[01]?\d|2[0-3])|\*(?:/[01]?\d|/2[0-3])?))*)\s+" # hour
120
122
r"((?:0?[1-9]|[12]\d|3[01]|(?:0?[1-9]|[12]\d|3[01])-(?:0?[1-9]|[12]\d|3[01])|\*(?:/[0-9]|/1[0-9]|/2[0-9]|/3[01])?)(?:,(?:0?[1-9]|[12]\d|3[01]|(?:0?[1-9]|[12]\d|3[01])-(?:0?[1-9]|[12]\d|3[01])|\*(?:/[0-9]|/1[0-9]|/2[0-9]|/3[01])?))*)\s+" # day of month
121
123
r"((?:[1-9]|1[0-2]|(?:[1-9]|1[0-2])-(?:[1-9]|1[0-2])|\*(?:/[1-9]|/1[0-2])?)(?:,(?:[1-9]|1[0-2]|(?:[1-9]|1[0-2])-(?:[1-9]|1[0-2])|\*(?:/[1-9]|/1[0-2])?))*)\s+" # month
122
124
r"((?:[0-6]|(?:[0-6])-(?:[0-6])|\*(?:/[0-6])?)(?:,(?:[0-6]|(?:[0-6])-(?:[0-6])|\*(?:/[0-6])?))*)" # day of week
123
- r"$|^$" # Empty string to signify no schedule
125
+ r"$|^$" # Empty string to signify no schedule
124
126
)
125
127
126
128
@@ -129,7 +131,7 @@ class ResourceConfigWithSchedule(ResourceConfig):
129
131
default = "" ,
130
132
title = "Schedule" ,
131
133
description = "Schedule to automatically rebackfill this binding. Accepts a cron expression." ,
132
- pattern = CRON_REGEX
134
+ pattern = CRON_REGEX ,
133
135
)
134
136
135
137
@@ -164,8 +166,7 @@ class Backfill(BaseModel, extra="forbid"):
164
166
description = "LogCursor at which incremental replication began"
165
167
)
166
168
next_page : PageCursor = Field (
167
- description = "PageCursor of the next page to fetch" ,
168
- default = None
169
+ description = "PageCursor of the next page to fetch" , default = None
169
170
)
170
171
171
172
class Snapshot (BaseModel , extra = "forbid" ):
@@ -179,18 +180,20 @@ class Snapshot(BaseModel, extra="forbid"):
179
180
description = "The xxh3_128 hex digest of documents of this resource in the last snapshot"
180
181
)
181
182
182
- inc : Incremental | None = Field (
183
+ inc : Incremental | dict [ str , Incremental | None ] | None = Field (
183
184
default = None , description = "Incremental capture progress"
184
185
)
185
186
186
- backfill : Backfill | None = Field (
187
+ backfill : Backfill | dict [ str , Backfill | None ] | None = Field (
187
188
default = None ,
188
189
description = "Backfill progress, or None if no backfill is occurring" ,
189
190
)
190
191
191
192
snapshot : Snapshot | None = Field (default = None , description = "Snapshot progress" )
192
193
193
- last_initialized : datetime | None = Field (default = None , description = "The last time this state was initialized." )
194
+ last_initialized : datetime | None = Field (
195
+ default = None , description = "The last time this state was initialized."
196
+ )
194
197
195
198
196
199
_ResourceState = TypeVar ("_ResourceState" , bound = ResourceState )
@@ -213,6 +216,7 @@ class AssociatedDocument(Generic[_BaseDocument]):
213
216
You might use this if your data model requires you to load "child" documents when capturing a "parent" document,
214
217
instead of independently loading the child data stream.
215
218
"""
219
+
216
220
doc : _BaseDocument
217
221
binding : int
218
222
@@ -317,7 +321,7 @@ class FixedSchema:
317
321
CaptureBinding [_ResourceConfig ],
318
322
"Resource[_BaseDocument, _ResourceConfig, _ResourceState]" ,
319
323
]
320
- ]
324
+ ],
321
325
],
322
326
None ,
323
327
]
@@ -363,7 +367,6 @@ def resolve_bindings(
363
367
resources : list [Resource [Any , _BaseResourceConfig , Any ]],
364
368
resource_term = "Resource" ,
365
369
) -> list [tuple [_ResolvableBinding , Resource [Any , _BaseResourceConfig , Any ]]]:
366
-
367
370
resolved : list [
368
371
tuple [_ResolvableBinding , Resource [Any , _BaseResourceConfig , Any ]]
369
372
] = []
@@ -397,7 +400,6 @@ def validated(
397
400
]
398
401
],
399
402
) -> response .Validated :
400
-
401
403
return response .Validated (
402
404
bindings = [
403
405
response .ValidatedBinding (resourcePath = b [0 ].resourceConfig .path ())
@@ -415,7 +417,6 @@ def open(
415
417
]
416
418
],
417
419
) -> tuple [response .Opened , Callable [[Task ], Awaitable [None ]]]:
418
-
419
420
async def _run (task : Task ):
420
421
backfill_requests = []
421
422
if open .state .backfillRequests is not None :
@@ -445,17 +446,20 @@ async def _run(task: Task):
445
446
if state .last_initialized is None :
446
447
state .last_initialized = datetime .now (tz = UTC )
447
448
task .checkpoint (
448
- ConnectorState (
449
- bindingStateV1 = {binding .stateKey : state }
450
- )
449
+ ConnectorState (bindingStateV1 = {binding .stateKey : state })
451
450
)
452
451
453
452
if isinstance (binding .resourceConfig , ResourceConfigWithSchedule ):
454
453
cron_schedule = binding .resourceConfig .schedule
455
- next_scheduled_initialization = next_fire (cron_schedule , state .last_initialized )
454
+ next_scheduled_initialization = next_fire (
455
+ cron_schedule , state .last_initialized
456
+ )
456
457
457
- if next_scheduled_initialization and next_scheduled_initialization < datetime .now (tz = UTC ):
458
- # Re-initialize the binding if we missed a scheduled re-initialization.
458
+ if (
459
+ next_scheduled_initialization
460
+ and next_scheduled_initialization < datetime .now (tz = UTC )
461
+ ):
462
+ # Re-initialize the binding if we missed a scheduled re-initialization.
459
463
should_initialize = True
460
464
if state .backfill :
461
465
task .log .warning (
@@ -464,12 +468,22 @@ async def _run(task: Task):
464
468
" complete before the next scheduled backfill starts."
465
469
)
466
470
467
- next_scheduled_initialization = next_fire (cron_schedule , datetime .now (tz = UTC ))
471
+ next_scheduled_initialization = next_fire (
472
+ cron_schedule , datetime .now (tz = UTC )
473
+ )
468
474
469
- if next_scheduled_initialization and soonest_future_scheduled_initialization :
470
- soonest_future_scheduled_initialization = min (soonest_future_scheduled_initialization , next_scheduled_initialization )
475
+ if (
476
+ next_scheduled_initialization
477
+ and soonest_future_scheduled_initialization
478
+ ):
479
+ soonest_future_scheduled_initialization = min (
480
+ soonest_future_scheduled_initialization ,
481
+ next_scheduled_initialization ,
482
+ )
471
483
elif next_scheduled_initialization :
472
- soonest_future_scheduled_initialization = next_scheduled_initialization
484
+ soonest_future_scheduled_initialization = (
485
+ next_scheduled_initialization
486
+ )
473
487
474
488
if should_initialize :
475
489
# Checkpoint the binding's initialized state prior to any processing.
@@ -478,7 +492,7 @@ async def _run(task: Task):
478
492
479
493
task .checkpoint (
480
494
ConnectorState (
481
- bindingStateV1 = {binding .stateKey : state }
495
+ bindingStateV1 = {binding .stateKey : state },
482
496
)
483
497
)
484
498
@@ -487,7 +501,7 @@ async def _run(task: Task):
487
501
index ,
488
502
state ,
489
503
task ,
490
- resolved_bindings
504
+ resolved_bindings ,
491
505
)
492
506
493
507
async def scheduled_stop (future_dt : datetime | None ) -> None :
@@ -510,8 +524,12 @@ def open_binding(
510
524
binding_index : int ,
511
525
state : _ResourceState ,
512
526
task : Task ,
513
- fetch_changes : FetchChangesFn [_BaseDocument ] | None = None ,
514
- fetch_page : FetchPageFn [_BaseDocument ] | None = None ,
527
+ fetch_changes : FetchChangesFn [_BaseDocument ]
528
+ | dict [str , FetchChangesFn [_BaseDocument ]]
529
+ | None = None ,
530
+ fetch_page : FetchPageFn [_BaseDocument ]
531
+ | dict [str , FetchPageFn [_BaseDocument ]]
532
+ | None = None ,
515
533
fetch_snapshot : FetchSnapshotFn [_BaseDocument ] | None = None ,
516
534
tombstone : _BaseDocument | None = None ,
517
535
):
@@ -520,30 +538,96 @@ def open_binding(
520
538
521
539
It does 'heavy lifting' to actually capture a binding.
522
540
523
- TODO(johnny): Separate into snapshot vs incremental tasks?
541
+ When fetch_changes or fetch_page is provided as a dictionary, each function in
542
+ that dictionary is run as a separate subtask with its own independent state.
543
+ The dictionary keys serve as subtask IDs and are used to store and retrieve
544
+ the state for each subtask in state.inc or state.backfill (fetch_snapshot takes a single function only).
524
545
"""
525
546
526
547
prefix = "." .join (binding .resourceConfig .path ())
527
548
528
549
if fetch_changes :
529
550
530
- async def closure (task : Task ):
531
- assert state .inc
551
+ async def incremental_closure (
552
+ task : Task ,
553
+ fetch_changes : FetchChangesFn [_BaseDocument ],
554
+ state : ResourceState .Incremental ,
555
+ ):
556
+ assert state and not isinstance (state , dict )
532
557
await _binding_incremental_task (
533
- binding , binding_index , fetch_changes , state .inc , task ,
558
+ binding ,
559
+ binding_index ,
560
+ fetch_changes ,
561
+ state ,
562
+ task ,
534
563
)
535
564
536
- task .spawn_child (f"{ prefix } .incremental" , closure )
565
+ if isinstance (fetch_changes , dict ):
566
+ assert state .inc and isinstance (state .inc , dict )
567
+ for subtask_id , subtask_fetch_changes in fetch_changes .items ():
568
+ inc_state = state .inc .get (subtask_id )
569
+ assert inc_state
570
+
571
+ task .spawn_child (
572
+ f"{ prefix } .incremental.{ subtask_id } " ,
573
+ functools .partial (
574
+ incremental_closure ,
575
+ fetch_changes = subtask_fetch_changes ,
576
+ state = inc_state ,
577
+ ),
578
+ )
579
+ else :
580
+ assert state .inc and not isinstance (state .inc , dict )
581
+ task .spawn_child (
582
+ f"{ prefix } .incremental" ,
583
+ functools .partial (
584
+ incremental_closure ,
585
+ fetch_changes = fetch_changes ,
586
+ state = state .inc ,
587
+ ),
588
+ )
537
589
538
590
if fetch_page and state .backfill :
539
591
540
- async def closure (task : Task ):
541
- assert state .backfill
592
+ async def backfill_closure (
593
+ task : Task ,
594
+ fetch_page : FetchPageFn [_BaseDocument ],
595
+ state : ResourceState .Backfill ,
596
+ ):
597
+ assert state and not isinstance (state , dict )
542
598
await _binding_backfill_task (
543
- binding , binding_index , fetch_page , state .backfill , task ,
599
+ binding ,
600
+ binding_index ,
601
+ fetch_page ,
602
+ state ,
603
+ task ,
544
604
)
545
605
546
- task .spawn_child (f"{ prefix } .backfill" , closure )
606
+ if isinstance (fetch_page , dict ):
607
+ assert state .backfill and isinstance (state .backfill , dict )
608
+ for subtask_id , subtask_fetch_page in fetch_page .items ():
609
+ backfill_state = state .backfill .get (subtask_id )
610
+ assert backfill_state
611
+
612
+ task .spawn_child (
613
+ f"{ prefix } .backfill.{ subtask_id } " ,
614
+ functools .partial (
615
+ backfill_closure ,
616
+ fetch_page = subtask_fetch_page ,
617
+ state = backfill_state ,
618
+ ),
619
+ )
620
+
621
+ else :
622
+ assert state .backfill and not isinstance (state .backfill , dict )
623
+ task .spawn_child (
624
+ f"{ prefix } .backfill" ,
625
+ functools .partial (
626
+ backfill_closure ,
627
+ fetch_page = fetch_page ,
628
+ state = state .backfill ,
629
+ ),
630
+ )
547
631
548
632
if fetch_snapshot :
549
633
@@ -612,7 +696,7 @@ async def _binding_snapshot_task(
612
696
if isinstance (doc , dict ):
613
697
doc ["meta_" ] = {
614
698
"op" : "u" if count < state .last_count else "c" ,
615
- "row_id" : count
699
+ "row_id" : count ,
616
700
}
617
701
else :
618
702
doc .meta_ = BaseDocument .Meta (
@@ -719,7 +803,10 @@ async def _binding_incremental_task(
719
803
720
804
if lag < binding .resourceConfig .interval :
721
805
sleep_for = binding .resourceConfig .interval - lag
722
- task .log .info ("incremental task ran recently, sleeping until `interval` has fully elapsed" , {"sleep_for" : sleep_for , "interval" : binding .resourceConfig .interval })
806
+ task .log .info (
807
+ "incremental task ran recently, sleeping until `interval` has fully elapsed" ,
808
+ {"sleep_for" : sleep_for , "interval" : binding .resourceConfig .interval },
809
+ )
723
810
724
811
while True :
725
812
try :
@@ -747,9 +834,7 @@ async def _binding_incremental_task(
747
834
task .log .info ("incremental task triggered backfill" )
748
835
task .stopping .event .set ()
749
836
task .checkpoint (
750
- ConnectorState (
751
- backfillRequests = {binding .stateKey : True }
752
- )
837
+ ConnectorState (backfillRequests = {binding .stateKey : True })
753
838
)
754
839
return
755
840
else :
@@ -759,7 +844,12 @@ async def _binding_incremental_task(
759
844
is_larger = item > state .cursor
760
845
elif isinstance (item , datetime ) and isinstance (state .cursor , datetime ):
761
846
is_larger = item > state .cursor
762
- elif isinstance (item , tuple ) and isinstance (state .cursor , tuple ) and isinstance (item [0 ], str ) and isinstance (state .cursor [0 ], str ):
847
+ elif (
848
+ isinstance (item , tuple )
849
+ and isinstance (state .cursor , tuple )
850
+ and isinstance (item [0 ], str )
851
+ and isinstance (state .cursor [0 ], str )
852
+ ):
763
853
is_larger = item [0 ] > state .cursor [0 ]
764
854
else :
765
855
raise RuntimeError (
@@ -786,7 +876,7 @@ async def _binding_incremental_task(
786
876
sleep_for = binding .resourceConfig .interval
787
877
788
878
elif isinstance (state .cursor , datetime ):
789
- lag = ( datetime .now (tz = UTC ) - state .cursor )
879
+ lag = datetime .now (tz = UTC ) - state .cursor
790
880
791
881
if lag > binding .resourceConfig .interval :
792
882
# We're not idle. Attempt to fetch the next changes.
@@ -800,4 +890,6 @@ async def _binding_incremental_task(
800
890
sleep_for = timedelta ()
801
891
continue
802
892
803
- task .log .debug ("incremental task is idle" , {"sleep_for" : sleep_for , "cursor" : state .cursor })
893
+ task .log .debug (
894
+ "incremental task is idle" , {"sleep_for" : sleep_for , "cursor" : state .cursor }
895
+ )
0 commit comments