4444from fastdeploy .metrics .metrics import main_process_metrics
4545from fastdeploy .multimodal .hasher import MultimodalHasher
4646from fastdeploy .platforms import current_platform
47- from fastdeploy .utils import llm_logger
47+ from fastdeploy .utils import download_from_bos , init_bos_client , llm_logger
4848
4949
5050@dataclass
@@ -195,6 +195,9 @@ def __init__(self, max_num_seqs, config, tensor_parallel_size, splitwise_role, l
195195 max_processor_cache_in_bytes = int (config .cache_config .max_processor_cache * 1024 * 1024 * 1024 )
196196 self .processor_cache = ProcessorCacheManager (max_processor_cache_in_bytes )
197197
198+ self .bos_client = None
199+ self .async_preprocess_pool = ThreadPoolExecutor (max_workers = 4 )
200+
def allocated_slots(self, request: Request):
    """
    Return how many slots the request's block tables cover.

    Args:
        request: Request whose ``block_tables`` are counted.
    Returns:
        ``len(request.block_tables)`` multiplied by the configured
        ``cache_config.block_size``.
    """
    block_count = len(request.block_tables)
    slots_per_block = self.config.cache_config.block_size
    return block_count * slots_per_block
200203
@@ -500,6 +503,7 @@ def schedule(self):
500503 with self .lock :
501504 scheduled_reqs : list [Request ] = []
502505 preempted_reqs : list [Request ] = []
506+ error_reqs : list [tuple [str , str ]] = []
503507 token_budget = self .config .scheduler_config .max_num_batched_tokens
504508
505509 # First, schedule the RUNNING requests.
@@ -629,6 +633,7 @@ def _allocate_decode_and_extend():
629633 req_index += 1
630634 # schedule the WAITING requests.
631635 if not preempted_reqs :
636+ skip_requests : list [Request ] = []
632637 while self .waiting and token_budget > 0 :
633638 if len (self .running ) == self .max_num_seqs :
634639 break
@@ -639,6 +644,17 @@ def _allocate_decode_and_extend():
639644 ):
640645 break
641646 if request .status == RequestStatus .WAITING :
647+ result = self ._waiting_async_process (request )
648+ if result is None :
649+ error_reqs .append ((request .request_id , request .error_message ))
650+ self .waiting .popleft ()
651+ continue
652+ elif result is True :
653+ # skip current request, try next request
654+ skip_requests .append (request )
655+ self .waiting .popleft ()
656+ continue
657+
642658 self ._update_mm_hashes (request )
643659 # Enable prefix caching
644660 if self .config .cache_config .enable_prefix_caching :
@@ -725,12 +741,102 @@ def _allocate_decode_and_extend():
725741 else :
726742 llm_logger .error ("Unknown request status type" )
727743
744+ for req in skip_requests :
745+ # move waiting request to end of the deque
746+ self .waiting .append (req )
747+
728748 if scheduled_reqs :
729749 llm_logger .debug (f"schedued_reqs: { scheduled_reqs } " )
730750
731751 self .update_metrics ()
732752
733- return scheduled_reqs
753+ return scheduled_reqs , error_reqs
754+
def _waiting_async_process(self, request: Request) -> "bool | None":
    """
    Check if async preprocessing is complete for a request.

    Futures are inspected in submission order. A future that finished by
    raising an exception (or by being cancelled) is treated as a failed
    preprocessing step: the failure is recorded on the request so the
    caller can report it instead of scheduling a request whose features
    were never attached.

    Args:
        request: The request to check
    Returns:
        None: If an error occurred during preprocessing
        True: If preprocessing is still in progress (request should be skipped)
        False: If preprocessing is complete (request can be scheduled)
    """
    for future in request.async_process_futures:
        if not future.done():
            # At least one task is still running; try this request again later.
            return True

        if request.get("error_message") is not None:
            # The task reported a failure on the request itself.
            return None

        # done() is also True for tasks that raised or were cancelled, and
        # those leave no error_message behind; surface them explicitly.
        if future.cancelled():
            failure = "task was cancelled"
        else:
            failure = future.exception()
        if failure is not None:
            request.error_message = f"request {request.request_id} async preprocess failed: {failure!r}"
            # Same code that _download_features uses for failed downloads.
            request.error_code = 530
            return None

    # Every task finished cleanly; nothing left to wait on.
    request.async_process_futures = []
    return False
773+
def _apply_async_preprocess(self, request: Request) -> None:
    """
    Queue the feature download for a request on the preprocess pool.

    Args:
        request: Request handed to ``self._download_features``; the
            resulting future is appended to ``request.async_process_futures``.
    """
    tracked_futures = request.async_process_futures
    download_future = self.async_preprocess_pool.submit(self._download_features, request)
    tracked_futures.append(download_future)
776+
def _has_features_info(self, task):
    """
    Tell whether a task lists any feature URLs.

    Args:
        task: Object exposing ``multimodal_inputs`` (a mapping or None).
    Returns:
        True if at least one of ``video_feature_urls``,
        ``image_feature_urls`` or ``audio_feature_urls`` is present and
        non-empty; False otherwise, including when ``multimodal_inputs``
        is None or empty.
    """
    mm_inputs = task.multimodal_inputs
    if mm_inputs is None or len(mm_inputs) == 0:
        return False

    url_keys = ("video_feature_urls", "image_feature_urls", "audio_feature_urls")
    return any(mm_inputs.get(key) is not None and len(mm_inputs[key]) > 0 for key in url_keys)
789+
def _download_features(self, request: Request) -> None:
    """
    Download multimodal features from BOS and attach them to the request.

    For each of the video, image and audio modalities (in that order), a
    non-empty ``<modality>_feature_urls`` entry in
    ``request.multimodal_inputs`` is downloaded and stored under
    ``<modality>_features``. Processing stops at the first failure.

    Note:
        1. This function adds features to request.multimodal_inputs.
        2. On any failure (a failed download or an unexpected exception)
           it sets request.error_message and request.error_code (530)
           instead of raising.
    Args:
        request (Request): request object
    """

    def download_bos_features(bos_client, features_urls):
        """Return the downloaded features, or an error string on the first failed URL."""
        result_list = []
        for status, feature in download_from_bos(bos_client, features_urls):
            if status:
                llm_logger.info(f"request {request.request_id} async download feature: {feature.shape}")
                result_list.append(feature)
            else:
                error_msg = f"request {request.request_id} download features error: {feature}"
                llm_logger.error(error_msg)
                return error_msg
        return result_list

    if not self.config.parallel_config.enable_async_download_features or not self._has_features_info(request):
        return None

    try:
        # Created lazily on first use.
        if self.bos_client is None:
            self.bos_client = init_bos_client()

        inputs = request.multimodal_inputs
        for modality in ("video", "image", "audio"):
            urls = inputs.get(f"{modality}_feature_urls")
            if urls is None or len(urls) == 0:
                continue
            result = download_bos_features(self.bos_client, urls)
            if isinstance(result, str):  # download error
                request.error_message = result
                request.error_code = 530
                return None
            inputs[f"{modality}_features"] = result
    except Exception as exc:
        # This runs as a pool task: an exception raised here would only be
        # stored on the Future. Recording it on the request makes the
        # failure visible through error_message like any other download error.
        error_msg = f"request {request.request_id} download features error: {exc!r}"
        llm_logger.error(error_msg)
        request.error_message = error_msg
        request.error_code = 530
    return None
734840
735841 def get_available_position (self ) -> int :
736842 position = 0
@@ -788,6 +894,7 @@ def get_prefix_cached_blocks(self, request: Request):
788894
789895 def add_request (self , request : Request ) -> None :
790896 with self .lock :
897+ self ._apply_async_preprocess (request )
791898 self .waiting .append (request )
792899 self .requests [request .request_id ] = request
793900
0 commit comments