File tree 1 file changed +13
-0
lines changed
1 file changed +13
-0
lines changed Original file line number Diff line number Diff line change 10
10
from data_juicer import cuda_device_count
11
11
from data_juicer .core .data import DJDataset
12
12
from data_juicer .ops import Deduplicator , Filter , Mapper
13
+ from data_juicer .ops .base_op import TAGGING_OPS
13
14
from data_juicer .utils .constant import Fields
14
15
from data_juicer .utils .lazy_loader import LazyLoader
15
16
from data_juicer .utils .process_utils import calculate_np
@@ -108,6 +109,18 @@ def _run_single_op(self, op):
108
109
op_proc = calculate_np (op ._name , op .mem_required , op .cpu_required ,
109
110
self .num_proc , op .use_cuda ())
110
111
num_gpus = get_num_gpus (op , op_proc )
112
+
113
+ if op ._name in TAGGING_OPS .modules and Fields .meta not in self .data .columns (
114
+ ):
115
+
116
def process_batch_arrow(table: pyarrow.Table):
    """Return ``table`` extended with an empty-dict ``Fields.meta`` column.

    One fresh ``{}`` is generated per row of ``table``; the resulting list
    is wrapped in a one-element list and handed to ``table.append_column``
    under the column name ``Fields.meta``.

    :param table: the batch to extend, annotated as a ``pyarrow.Table``.
    :return: the value returned by ``table.append_column``.
    """
    # A separate dict object per row, exactly ``len(table)`` of them.
    empty_meta = [{} for _ in range(len(table))]
    return table.append_column(Fields.meta, [empty_meta])
120
+
121
+ self .data = self .data .map_batches (process_batch_arrow ,
122
+ batch_format = 'pyarrow' )
123
+
111
124
try :
112
125
batch_size = getattr (op , 'batch_size' ,
113
126
1 ) if op .is_batched_op () else 1
You can’t perform that action at this time.
0 commit comments