@@ -155,6 +155,7 @@ func importDataCommandFn(cmd *cobra.Command, args []string) {
155
155
// TODO: handle case-sensitive in table names with oracle ff-db
156
156
// quoteTableNameIfRequired()
157
157
importFileTasks := discoverFilesToImport ()
158
+ log .Debugf ("Discovered import file tasks: %v" , importFileTasks )
158
159
if importerRole == TARGET_DB_IMPORTER_ROLE {
159
160
160
161
importType = record .ExportType
@@ -316,6 +317,10 @@ type ImportFileTask struct {
316
317
FileSize int64
317
318
}
318
319
320
// String returns a compact one-line summary of the task for logging
// (used by the %v verbs in the debug/info logs that print task slices).
func (task *ImportFileTask) String() string {
	return fmt.Sprintf("{ID: %d, FilePath: %s, TableName: %s, RowCount: %d, FileSize: %d}", task.ID, task.FilePath, task.TableNameTup.ForOutput(), task.RowCount, task.FileSize)
}
319
324
// func quoteTableNameIfRequired() {
320
325
// if tconf.TargetDBType != ORACLE {
321
326
// return
@@ -543,6 +548,8 @@ func importData(importFileTasks []*ImportFileTask) {
543
548
utils .ErrExit ("Failed to classify tasks: %s" , err )
544
549
}
545
550
}
551
+ log .Infof ("pending tasks: %v" , pendingTasks )
552
+ log .Infof ("completed tasks: %v" , completedTasks )
546
553
547
554
//TODO: BUG: we are applying table-list filter on importFileTasks, but here we are considering all tables as per
548
555
// export-data table-list. Should be fine because we are only disabling and re-enabling, but this is still not ideal.
@@ -584,28 +591,36 @@ func importData(importFileTasks []*ImportFileTask) {
584
591
controlPlane .UpdateImportedRowCount (importDataAllTableMetrics )
585
592
}
586
593
587
- for _ , task := range pendingTasks {
588
- // The code can produce `poolSize` number of batches at a time. But, it can consume only
589
- // `parallelism` number of batches at a time.
590
- batchImportPool = pool .New ().WithMaxGoroutines (poolSize )
591
- log .Infof ("created batch import pool of size: %d" , poolSize )
592
-
593
- taskImporter , err := NewFileTaskImporter (task , state , batchImportPool , progressReporter )
594
+ useTaskPicker := utils .GetEnvAsBool ("USE_TASK_PICKER_FOR_IMPORT" , true )
595
+ if useTaskPicker {
596
+ err := importTasksViaTaskPicker (pendingTasks , state , progressReporter , poolSize )
594
597
if err != nil {
595
- utils .ErrExit ("Failed to create file task importer : %s" , err )
598
+ utils .ErrExit ("Failed to import tasks via task picker : %s" , err )
596
599
}
600
+ } else {
601
+ for _ , task := range pendingTasks {
602
+ // The code can produce `poolSize` number of batches at a time. But, it can consume only
603
+ // `parallelism` number of batches at a time.
604
+ batchImportPool = pool .New ().WithMaxGoroutines (poolSize )
605
+ log .Infof ("created batch import pool of size: %d" , poolSize )
597
606
598
- for ! taskImporter .AllBatchesSubmitted () {
599
- err := taskImporter .SubmitNextBatch ()
607
+ taskImporter , err := NewFileTaskImporter (task , state , batchImportPool , progressReporter )
600
608
if err != nil {
601
- utils .ErrExit ("Failed to submit next batch: task:%v err : %s" , task , err )
609
+ utils .ErrExit ("Failed to create file task importer : %s" , err )
602
610
}
603
- }
604
611
605
- batchImportPool .Wait () // Wait for the file import to finish.
606
- taskImporter .PostProcess ()
612
+ for ! taskImporter .AllBatchesSubmitted () {
613
+ err := taskImporter .ProduceAndSubmitNextBatchToWorkerPool ()
614
+ if err != nil {
615
+ utils .ErrExit ("Failed to submit next batch: task:%v err: %s" , task , err )
616
+ }
617
+ }
618
+
619
+ batchImportPool .Wait () // wait for file import to finish
620
+ taskImporter .PostProcess ()
621
+ }
622
+ time .Sleep (time .Second * 2 )
607
623
}
608
- time .Sleep (time .Second * 2 )
609
624
}
610
625
utils .PrintAndLog ("snapshot data import complete\n \n " )
611
626
}
@@ -684,6 +699,75 @@ func importData(importFileTasks []*ImportFileTask) {
684
699
685
700
}
686
701
702
+ /*
703
+ 1. Initialize a worker pool
704
+ 2. Create a task picker which helps the importer choose which task to process in each iteration.
705
+ 3. Loop until all tasks are done:
706
+ - Pick a task from the task picker.
707
+ - If the task is not already being processed, create a new FileTaskImporter for the task.
708
+ - For the task that is picked, produce the next batch and submit it to the worker pool. Worker will asynchronously import the batch.
709
+ - If task is done, mark it as done in the task picker.
710
+ */
711
+ func importTasksViaTaskPicker (pendingTasks []* ImportFileTask , state * ImportDataState , progressReporter * ImportDataProgressReporter , poolSize int ) error {
712
+ // The code can produce `poolSize` number of batches at a time. But, it can consume only
713
+ // `parallelism` number of batches at a time.
714
+ batchImportPool = pool .New ().WithMaxGoroutines (poolSize )
715
+ log .Infof ("created batch import pool of size: %d" , poolSize )
716
+
717
+ taskPicker , err := NewSequentialTaskPicker (pendingTasks , state )
718
+ if err != nil {
719
+ return fmt .Errorf ("create task picker: %w" , err )
720
+ }
721
+ taskImporters := map [int ]* FileTaskImporter {}
722
+
723
+ for taskPicker .HasMoreTasks () {
724
+ task , err := taskPicker .Pick ()
725
+ if err != nil {
726
+ return fmt .Errorf ("get next task: %w" , err )
727
+ }
728
+ var taskImporter * FileTaskImporter
729
+ var ok bool
730
+ taskImporter , ok = taskImporters [task .ID ]
731
+ if ! ok {
732
+ taskImporter , err = NewFileTaskImporter (task , state , batchImportPool , progressReporter )
733
+ if err != nil {
734
+ return fmt .Errorf ("create file task importer: %s" , err )
735
+ }
736
+ log .Infof ("created file task importer for table: %s, task: %v" , task .TableNameTup .ForOutput (), task )
737
+ taskImporters [task .ID ] = taskImporter
738
+ }
739
+
740
+ if taskImporter .AllBatchesSubmitted () {
741
+ // All batches for this task have been submitted.
742
+ // task could have been completed (all batches imported) OR still in progress
743
+ // in case task is done, we should inform task picker so that we stop picking that task.
744
+ taskDone , err := taskImporter .AllBatchesImported ()
745
+ if err != nil {
746
+ return fmt .Errorf ("check if all batches are imported: task: %v err :%w" , task , err )
747
+ }
748
+ if taskDone {
749
+ taskImporter .PostProcess ()
750
+ err = taskPicker .MarkTaskAsDone (task )
751
+ if err != nil {
752
+ return fmt .Errorf ("mark task as done: task: %v, err: %w" , task , err )
753
+ }
754
+ continue
755
+ } else {
756
+ // some batches are still in progress, wait for them to complete as decided by the picker.
757
+ // don't want to busy-wait, so in case of sequentialTaskPicker, we sleep.
758
+ taskPicker .WaitForTasksBatchesTobeImported ()
759
+ continue
760
+ }
761
+
762
+ }
763
+ err = taskImporter .ProduceAndSubmitNextBatchToWorkerPool ()
764
+ if err != nil {
765
+ return fmt .Errorf ("submit next batch: task:%v err: %s" , task , err )
766
+ }
767
+ }
768
+ return nil
769
+ }
770
+
687
771
func startAdaptiveParallelism () (bool , error ) {
688
772
yb , ok := tdb .(* tgtdb.TargetYugabyteDB )
689
773
if ! ok {
0 commit comments