22from multiprocessing .pool import ThreadPool
33from typing import TYPE_CHECKING , Callable , Dict , List , Tuple
44
5- from cycode .cli .consts import (
6- SCAN_BATCH_MAX_FILES_COUNT ,
7- SCAN_BATCH_MAX_PARALLEL_SCANS ,
8- SCAN_BATCH_MAX_SIZE_IN_BYTES ,
9- SCAN_BATCH_SCANS_PER_CPU ,
10- )
5+ from cycode .cli import consts
116from cycode .cli .models import Document
127from cycode .cli .utils .progress_bar import ScanProgressBarSection
138
1813
1914def split_documents_into_batches (
2015 documents : List [Document ],
21- max_size_mb : int = SCAN_BATCH_MAX_SIZE_IN_BYTES ,
22- max_files_count : int = SCAN_BATCH_MAX_FILES_COUNT ,
16+ max_size : int = consts . DEFAULT_SCAN_BATCH_MAX_SIZE_IN_BYTES ,
17+ max_files_count : int = consts . DEFAULT_SCAN_BATCH_MAX_FILES_COUNT ,
2318) -> List [List [Document ]]:
2419 batches = []
2520
@@ -28,7 +23,7 @@ def split_documents_into_batches(
2823 for document in documents :
2924 document_size = len (document .content .encode ('UTF-8' ))
3025
31- if (current_size + document_size > max_size_mb ) or (len (current_batch ) >= max_files_count ):
26+ if (current_size + document_size > max_size ) or (len (current_batch ) >= max_files_count ):
3227 batches .append (current_batch )
3328
3429 current_batch = [document ]
@@ -45,17 +40,18 @@ def split_documents_into_batches(
4540
def _get_threads_count() -> int:
    """Return the worker-thread count for parallel batched scans.

    Scales with the machine's CPU count (falling back to 1 when
    ``os.cpu_count()`` returns ``None``), capped at
    ``consts.SCAN_BATCH_MAX_PARALLEL_SCANS`` so we never spawn an
    unbounded number of concurrent scans.
    """
    available_cpus = os.cpu_count() or 1
    desired = available_cpus * consts.SCAN_BATCH_SCANS_PER_CPU
    return min(desired, consts.SCAN_BATCH_MAX_PARALLEL_SCANS)
4944
5045
5146def run_parallel_batched_scan (
5247 scan_function : Callable [[List [Document ]], Tuple [str , 'CliError' , 'LocalScanResult' ]],
48+ scan_type : str ,
5349 documents : List [Document ],
5450 progress_bar : 'BaseProgressBar' ,
55- max_size_mb : int = SCAN_BATCH_MAX_SIZE_IN_BYTES ,
56- max_files_count : int = SCAN_BATCH_MAX_FILES_COUNT ,
5751) -> Tuple [Dict [str , 'CliError' ], List ['LocalScanResult' ]]:
58- batches = split_documents_into_batches (documents , max_size_mb , max_files_count )
52+ max_size = consts .SCAN_BATCH_MAX_SIZE_IN_BYTES .get (scan_type , consts .DEFAULT_SCAN_BATCH_MAX_SIZE_IN_BYTES )
53+ batches = split_documents_into_batches (documents , max_size )
54+
5955 progress_bar .set_section_length (ScanProgressBarSection .SCAN , len (batches )) # * 3
6056 # TODO(MarshalX): we should multiply the count of batches in SCAN section because each batch has 3 steps:
6157 # 1. scan creation
0 commit comments