1
1
import abc
2
2
import asyncio
3
+ import functools
3
4
from enum import Enum
4
5
from dataclasses import dataclass
5
6
from datetime import UTC , datetime , timedelta
54
55
55
56
56
57
class Triggers (Enum ):
57
- BACKFILL = ' BACKFILL'
58
+ BACKFILL = " BACKFILL"
58
59
59
60
60
61
class BaseDocument (BaseModel ):
@@ -114,13 +115,14 @@ def path(self) -> list[str]:
114
115
_ResourceConfig = TypeVar ("_ResourceConfig" , bound = ResourceConfig )
115
116
116
117
117
- CRON_REGEX = (r"^"
118
+ CRON_REGEX = (
119
+ r"^"
118
120
r"((?:[0-5]?\d(?:-[0-5]?\d)?|\*(?:/[0-5]?\d)?)(?:,(?:[0-5]?\d(?:-[0-5]?\d)?|\*(?:/[0-5]?\d)?))*)\s+" # minute
119
121
r"((?:[01]?\d|2[0-3]|(?:[01]?\d|2[0-3])-(?:[01]?\d|2[0-3])|\*(?:/[01]?\d|/2[0-3])?)(?:,(?:[01]?\d|2[0-3]|(?:[01]?\d|2[0-3])-(?:[01]?\d|2[0-3])|\*(?:/[01]?\d|/2[0-3])?))*)\s+" # hour
120
122
r"((?:0?[1-9]|[12]\d|3[01]|(?:0?[1-9]|[12]\d|3[01])-(?:0?[1-9]|[12]\d|3[01])|\*(?:/[0-9]|/1[0-9]|/2[0-9]|/3[01])?)(?:,(?:0?[1-9]|[12]\d|3[01]|(?:0?[1-9]|[12]\d|3[01])-(?:0?[1-9]|[12]\d|3[01])|\*(?:/[0-9]|/1[0-9]|/2[0-9]|/3[01])?))*)\s+" # day of month
121
123
r"((?:[1-9]|1[0-2]|(?:[1-9]|1[0-2])-(?:[1-9]|1[0-2])|\*(?:/[1-9]|/1[0-2])?)(?:,(?:[1-9]|1[0-2]|(?:[1-9]|1[0-2])-(?:[1-9]|1[0-2])|\*(?:/[1-9]|/1[0-2])?))*)\s+" # month
122
124
r"((?:[0-6]|(?:[0-6])-(?:[0-6])|\*(?:/[0-6])?)(?:,(?:[0-6]|(?:[0-6])-(?:[0-6])|\*(?:/[0-6])?))*)" # day of week
123
- r"$|^$" # Empty string to signify no schedule
125
+ r"$|^$" # Empty string to signify no schedule
124
126
)
125
127
126
128
@@ -129,7 +131,7 @@ class ResourceConfigWithSchedule(ResourceConfig):
129
131
default = "" ,
130
132
title = "Schedule" ,
131
133
description = "Schedule to automatically rebackfill this binding. Accepts a cron expression." ,
132
- pattern = CRON_REGEX
134
+ pattern = CRON_REGEX ,
133
135
)
134
136
135
137
@@ -164,8 +166,7 @@ class Backfill(BaseModel, extra="forbid"):
164
166
description = "LogCursor at which incremental replication began"
165
167
)
166
168
next_page : PageCursor = Field (
167
- description = "PageCursor of the next page to fetch" ,
168
- default = None
169
+ description = "PageCursor of the next page to fetch" , default = None
169
170
)
170
171
171
172
class Snapshot (BaseModel , extra = "forbid" ):
@@ -179,18 +180,20 @@ class Snapshot(BaseModel, extra="forbid"):
179
180
description = "The xxh3_128 hex digest of documents of this resource in the last snapshot"
180
181
)
181
182
182
- inc : Incremental | None = Field (
183
+ inc : Incremental | dict [ str , Incremental | None ] | None = Field (
183
184
default = None , description = "Incremental capture progress"
184
185
)
185
186
186
- backfill : Backfill | None = Field (
187
+ backfill : Backfill | dict [ str , Backfill | None ] | None = Field (
187
188
default = None ,
188
189
description = "Backfill progress, or None if no backfill is occurring" ,
189
190
)
190
191
191
192
snapshot : Snapshot | None = Field (default = None , description = "Snapshot progress" )
192
193
193
- last_initialized : datetime | None = Field (default = None , description = "The last time this state was initialized." )
194
+ last_initialized : datetime | None = Field (
195
+ default = None , description = "The last time this state was initialized."
196
+ )
194
197
195
198
196
199
_ResourceState = TypeVar ("_ResourceState" , bound = ResourceState )
@@ -213,6 +216,7 @@ class AssociatedDocument(Generic[_BaseDocument]):
213
216
You might use this if your data model requires you to load "child" documents when capturing a "parent" document,
214
217
instead of independently loading the child data stream.
215
218
"""
219
+
216
220
doc : _BaseDocument
217
221
binding : int
218
222
@@ -317,7 +321,7 @@ class FixedSchema:
317
321
CaptureBinding [_ResourceConfig ],
318
322
"Resource[_BaseDocument, _ResourceConfig, _ResourceState]" ,
319
323
]
320
- ]
324
+ ],
321
325
],
322
326
None ,
323
327
]
@@ -363,7 +367,6 @@ def resolve_bindings(
363
367
resources : list [Resource [Any , _BaseResourceConfig , Any ]],
364
368
resource_term = "Resource" ,
365
369
) -> list [tuple [_ResolvableBinding , Resource [Any , _BaseResourceConfig , Any ]]]:
366
-
367
370
resolved : list [
368
371
tuple [_ResolvableBinding , Resource [Any , _BaseResourceConfig , Any ]]
369
372
] = []
@@ -397,7 +400,6 @@ def validated(
397
400
]
398
401
],
399
402
) -> response .Validated :
400
-
401
403
return response .Validated (
402
404
bindings = [
403
405
response .ValidatedBinding (resourcePath = b [0 ].resourceConfig .path ())
@@ -415,7 +417,6 @@ def open(
415
417
]
416
418
],
417
419
) -> tuple [response .Opened , Callable [[Task ], Awaitable [None ]]]:
418
-
419
420
async def _run (task : Task ):
420
421
backfill_requests = []
421
422
if open .state .backfillRequests is not None :
@@ -445,17 +446,20 @@ async def _run(task: Task):
445
446
if state .last_initialized is None :
446
447
state .last_initialized = datetime .now (tz = UTC )
447
448
task .checkpoint (
448
- ConnectorState (
449
- bindingStateV1 = {binding .stateKey : state }
450
- )
449
+ ConnectorState (bindingStateV1 = {binding .stateKey : state })
451
450
)
452
451
453
452
if isinstance (binding .resourceConfig , ResourceConfigWithSchedule ):
454
453
cron_schedule = binding .resourceConfig .schedule
455
- next_scheduled_initialization = next_fire (cron_schedule , state .last_initialized )
454
+ next_scheduled_initialization = next_fire (
455
+ cron_schedule , state .last_initialized
456
+ )
456
457
457
- if next_scheduled_initialization and next_scheduled_initialization < datetime .now (tz = UTC ):
458
- # Re-initialize the binding if we missed a scheduled re-initialization.
458
+ if (
459
+ next_scheduled_initialization
460
+ and next_scheduled_initialization < datetime .now (tz = UTC )
461
+ ):
462
+ # Re-initialize the binding if we missed a scheduled re-initialization.
459
463
should_initialize = True
460
464
if state .backfill :
461
465
task .log .warning (
@@ -464,12 +468,22 @@ async def _run(task: Task):
464
468
" complete before the next scheduled backfill starts."
465
469
)
466
470
467
- next_scheduled_initialization = next_fire (cron_schedule , datetime .now (tz = UTC ))
471
+ next_scheduled_initialization = next_fire (
472
+ cron_schedule , datetime .now (tz = UTC )
473
+ )
468
474
469
- if next_scheduled_initialization and soonest_future_scheduled_initialization :
470
- soonest_future_scheduled_initialization = min (soonest_future_scheduled_initialization , next_scheduled_initialization )
475
+ if (
476
+ next_scheduled_initialization
477
+ and soonest_future_scheduled_initialization
478
+ ):
479
+ soonest_future_scheduled_initialization = min (
480
+ soonest_future_scheduled_initialization ,
481
+ next_scheduled_initialization ,
482
+ )
471
483
elif next_scheduled_initialization :
472
- soonest_future_scheduled_initialization = next_scheduled_initialization
484
+ soonest_future_scheduled_initialization = (
485
+ next_scheduled_initialization
486
+ )
473
487
474
488
if should_initialize :
475
489
# Checkpoint the binding's initialized state prior to any processing.
@@ -478,7 +492,7 @@ async def _run(task: Task):
478
492
479
493
task .checkpoint (
480
494
ConnectorState (
481
- bindingStateV1 = {binding .stateKey : state }
495
+ bindingStateV1 = {binding .stateKey : state },
482
496
)
483
497
)
484
498
@@ -487,7 +501,7 @@ async def _run(task: Task):
487
501
index ,
488
502
state ,
489
503
task ,
490
- resolved_bindings
504
+ resolved_bindings ,
491
505
)
492
506
493
507
async def scheduled_stop (future_dt : datetime | None ) -> None :
@@ -510,8 +524,12 @@ def open_binding(
510
524
binding_index : int ,
511
525
state : _ResourceState ,
512
526
task : Task ,
513
- fetch_changes : FetchChangesFn [_BaseDocument ] | None = None ,
514
- fetch_page : FetchPageFn [_BaseDocument ] | None = None ,
527
+ fetch_changes : FetchChangesFn [_BaseDocument ]
528
+ | dict [str , FetchChangesFn [_BaseDocument ]]
529
+ | None = None ,
530
+ fetch_page : FetchPageFn [_BaseDocument ]
531
+ | dict [str , FetchPageFn [_BaseDocument ]]
532
+ | None = None ,
515
533
fetch_snapshot : FetchSnapshotFn [_BaseDocument ] | None = None ,
516
534
tombstone : _BaseDocument | None = None ,
517
535
):
@@ -520,30 +538,92 @@ def open_binding(
520
538
521
539
It does 'heavy lifting' to actually capture a binding.
522
540
523
- TODO(johnny): Separate into snapshot vs incremental tasks?
541
+ When fetch_changes or fetch_page is provided as a dictionary, each function in it
542
+ is run as a separate subtask with its own independent state. The dictionary keys
543
+ are the subtask IDs, and each key looks up that subtask's state in state.inc or
544
+ state.backfill respectively. fetch_snapshot only accepts a single function.
524
545
"""
525
546
526
547
prefix = "." .join (binding .resourceConfig .path ())
527
548
528
549
if fetch_changes :
529
550
530
- async def closure (task : Task ):
531
- assert state .inc
551
+ async def closure (
552
+ task : Task ,
553
+ fetch_changes : FetchChangesFn [_BaseDocument ],
554
+ state : ResourceState .Incremental ,
555
+ ):
556
+ assert state and not isinstance (state , dict )
532
557
await _binding_incremental_task (
533
- binding , binding_index , fetch_changes , state .inc , task ,
558
+ binding ,
559
+ binding_index ,
560
+ fetch_changes ,
561
+ state ,
562
+ task ,
534
563
)
535
564
536
- task .spawn_child (f"{ prefix } .incremental" , closure )
565
+ if isinstance (fetch_changes , dict ):
566
+ for subtask_id , subtask_fetch_changes in fetch_changes .items ():
567
+ inc_state = state .inc .get (subtask_id )
568
+ assert inc_state
569
+
570
+ task .spawn_child (
571
+ f"{ prefix } .incremental.{ subtask_id } " ,
572
+ functools .partial (
573
+ closure ,
574
+ fetch_changes = subtask_fetch_changes ,
575
+ state = inc_state ,
576
+ ),
577
+ )
578
+ else :
579
+ task .spawn_child (
580
+ f"{ prefix } .incremental" ,
581
+ functools .partial (
582
+ closure ,
583
+ fetch_changes = fetch_changes ,
584
+ state = state .inc ,
585
+ ),
586
+ )
537
587
538
588
if fetch_page and state .backfill :
539
589
540
- async def closure (task : Task ):
541
- assert state .backfill
590
+ async def closure (
591
+ task : Task ,
592
+ fetch_page : FetchPageFn [_BaseDocument ],
593
+ state : ResourceState .Backfill ,
594
+ ):
595
+ assert state and not isinstance (state , dict )
542
596
await _binding_backfill_task (
543
- binding , binding_index , fetch_page , state .backfill , task ,
597
+ binding ,
598
+ binding_index ,
599
+ fetch_page ,
600
+ state ,
601
+ task ,
544
602
)
545
603
546
- task .spawn_child (f"{ prefix } .backfill" , closure )
604
+ if isinstance (fetch_page , dict ):
605
+ for subtask_id , subtask_fetch_page in fetch_page .items ():
606
+ backfill_state = state .backfill .get (subtask_id )
607
+ assert backfill_state
608
+
609
+ task .spawn_child (
610
+ f"{ prefix } .backfill.{ subtask_id } " ,
611
+ functools .partial (
612
+ closure ,
613
+ fetch_page = subtask_fetch_page ,
614
+ state = backfill_state ,
615
+ ),
616
+ )
617
+
618
+ else :
619
+ task .spawn_child (
620
+ f"{ prefix } .backfill" ,
621
+ functools .partial (
622
+ closure ,
623
+ fetch_page = fetch_page ,
624
+ state = state .backfill ,
625
+ ),
626
+ )
547
627
548
628
if fetch_snapshot :
549
629
@@ -612,7 +692,7 @@ async def _binding_snapshot_task(
612
692
if isinstance (doc , dict ):
613
693
doc ["meta_" ] = {
614
694
"op" : "u" if count < state .last_count else "c" ,
615
- "row_id" : count
695
+ "row_id" : count ,
616
696
}
617
697
else :
618
698
doc .meta_ = BaseDocument .Meta (
@@ -719,7 +799,10 @@ async def _binding_incremental_task(
719
799
720
800
if lag < binding .resourceConfig .interval :
721
801
sleep_for = binding .resourceConfig .interval - lag
722
- task .log .info ("incremental task ran recently, sleeping until `interval` has fully elapsed" , {"sleep_for" : sleep_for , "interval" : binding .resourceConfig .interval })
802
+ task .log .info (
803
+ "incremental task ran recently, sleeping until `interval` has fully elapsed" ,
804
+ {"sleep_for" : sleep_for , "interval" : binding .resourceConfig .interval },
805
+ )
723
806
724
807
while True :
725
808
try :
@@ -747,9 +830,7 @@ async def _binding_incremental_task(
747
830
task .log .info ("incremental task triggered backfill" )
748
831
task .stopping .event .set ()
749
832
task .checkpoint (
750
- ConnectorState (
751
- backfillRequests = {binding .stateKey : True }
752
- )
833
+ ConnectorState (backfillRequests = {binding .stateKey : True })
753
834
)
754
835
return
755
836
else :
@@ -759,7 +840,12 @@ async def _binding_incremental_task(
759
840
is_larger = item > state .cursor
760
841
elif isinstance (item , datetime ) and isinstance (state .cursor , datetime ):
761
842
is_larger = item > state .cursor
762
- elif isinstance (item , tuple ) and isinstance (state .cursor , tuple ) and isinstance (item [0 ], str ) and isinstance (state .cursor [0 ], str ):
843
+ elif (
844
+ isinstance (item , tuple )
845
+ and isinstance (state .cursor , tuple )
846
+ and isinstance (item [0 ], str )
847
+ and isinstance (state .cursor [0 ], str )
848
+ ):
763
849
is_larger = item [0 ] > state .cursor [0 ]
764
850
else :
765
851
raise RuntimeError (
@@ -786,7 +872,7 @@ async def _binding_incremental_task(
786
872
sleep_for = binding .resourceConfig .interval
787
873
788
874
elif isinstance (state .cursor , datetime ):
789
- lag = ( datetime .now (tz = UTC ) - state .cursor )
875
+ lag = datetime .now (tz = UTC ) - state .cursor
790
876
791
877
if lag > binding .resourceConfig .interval :
792
878
# We're not idle. Attempt to fetch the next changes.
@@ -800,4 +886,6 @@ async def _binding_incremental_task(
800
886
sleep_for = timedelta ()
801
887
continue
802
888
803
- task .log .debug ("incremental task is idle" , {"sleep_for" : sleep_for , "cursor" : state .cursor })
889
+ task .log .debug (
890
+ "incremental task is idle" , {"sleep_for" : sleep_for , "cursor" : state .cursor }
891
+ )
0 commit comments