@@ -64,41 +64,8 @@ def slice_lora_a(self, lora_a):
6464 MergedColumnParallelLinearWithShardedLoRA .slice_lora_a = slice_lora_a
6565
6666
# Monkeypatch PrometheusStatLogger to avoid NotImplementedError for LoRA in DP mode
def monkey_patch_prometheus_stat_logger_for_lora_in_dp_mode():
    """Install a wrapper around ``PrometheusStatLogger.__init__``.

    The wrapper hides ``vllm_config.lora_config`` while the original
    constructor runs, always restores it afterwards, and then creates the
    LoRA label names and the ``vllm:lora_requests_info`` gauge itself when
    a LoRA config was present.
    """
    from vllm.v1.metrics import loggers as vllm_metrics_loggers

    stat_logger_cls = vllm_metrics_loggers.PrometheusStatLogger
    wrapped_init = stat_logger_cls.__init__

    def _patched_prometheus_stat_logger_init(self, vllm_config, engine_indexes=None):
        """Patched init that temporarily disables lora_config to skip the DP mode check."""
        saved_lora_config = vllm_config.lora_config
        vllm_config.lora_config = None
        try:
            wrapped_init(self, vllm_config, engine_indexes)
        finally:
            # Restore the caller's config even if the wrapped constructor raised.
            vllm_config.lora_config = saved_lora_config

        if saved_lora_config is None:
            # No LoRA configured: nothing further to set up.
            return

        # The wrapped constructor ran with lora_config hidden, so the LoRA
        # metric attributes are created here instead.
        self.labelname_max_lora = "max_lora"
        self.labelname_waiting_lora_adapters = "waiting_lora_adapters"
        self.labelname_running_lora_adapters = "running_lora_adapters"
        self.max_lora = saved_lora_config.max_loras

        lora_labelnames = [
            self.labelname_max_lora,
            self.labelname_waiting_lora_adapters,
            self.labelname_running_lora_adapters,
        ]
        make_gauge = vllm_metrics_loggers.PrometheusStatLogger._gauge_cls
        self.gauge_lora_info = make_gauge(
            name="vllm:lora_requests_info",
            documentation="Running stats on lora requests.",
            multiprocess_mode="sum",
            labelnames=lora_labelnames,
        )

    stat_logger_cls.__init__ = _patched_prometheus_stat_logger_init
99-
100-
10167# Monkeypatch LoadLoRAAdapter to allow loading the same adapter multiple times
68+ # TODO: may be removable if we pass load_inplace=True (supported since vLLM 0.18, PR #31326)
10269def monkey_patch_load_lora_adapter ():
10370 from http import HTTPStatus
10471
@@ -153,6 +120,7 @@ async def _patched_load_lora_adapter(
153120
154121
155122# Monkeypatch LRUCacheWorkerLoRAManager to allow loading adapter inplace without doing it every request
123+ # TODO: may be removable if we pass load_inplace=True (supported since vLLM 0.18, PR #31326)
156124def monkey_patch_LRUCacheWorkerLoRAManager ():
157125 from vllm .lora .worker_manager import LoRARequest , LRUCacheLoRAModelManager , LRUCacheWorkerLoRAManager
158126
@@ -278,109 +246,6 @@ def _patched_get_encode_kwargs(self):
278246 TokenizeParams .get_encode_kwargs = _patched_get_encode_kwargs
279247
280248
def monkey_patch_hermes_tool_parser_thread_safety():
    """Patch Hermes2ProToolParser to cache tokenizer encode/decode results.

    The original __init__ calls tokenizer.encode() and tokenizer.decode() on
    every instantiation. Under concurrent load, the shared HuggingFace tokenizer's
    Rust backend panics with ``RuntimeError: Already borrowed`` because multiple
    threads mutably borrow the same internal state simultaneously.

    Fix: run the first __init__ (which calls encode/decode) under a lock, cache
    the results, and reuse them for all subsequent instantiations without ever
    touching the tokenizer again.

    Cache entries are keyed by ``id()`` of the underlying tokenizer. Each
    entry also holds a reference to that tokenizer so the id cannot be
    recycled for a different object while the entry exists; the trade-off is
    that a cached tokenizer stays alive for the lifetime of the process.
    """
    import threading

    import regex as re
    from vllm.tool_parsers.abstract_tool_parser import ToolParser
    from vllm.tool_parsers.hermes_tool_parser import Hermes2ProToolParser

    _original_init = Hermes2ProToolParser.__init__
    _cache: dict[int, dict] = {}
    _lock = threading.Lock()

    def _init_from_cache(self, tokenizer, cached, is_mistral):
        """Set up the parser's attributes from ``cached`` without calling encode/decode."""
        ToolParser.__init__(self, tokenizer)
        if is_mistral:
            self.model_tokenizer = tokenizer.tokenizer
        self.current_tool_name_sent = False
        self.prev_tool_call_arr = []
        self.current_tool_id = -1
        self.streamed_args_for_tool = []
        self.tool_call_start_token = "<tool_call>"
        self.tool_call_end_token = "</tool_call>"
        self.tool_call_regex = re.compile(r"<tool_call>(.*?)</tool_call>|<tool_call>(.*)", re.DOTALL)
        self.scratch_pad_regex = re.compile(r"<scratch_pad>(.*?)</scratch_pad>", re.DOTALL)
        self.tool_call_start_token_ids = cached["start_ids"]
        self.tool_call_end_token_ids = cached["end_ids"]
        self.tool_call_start_token_array = cached["start_array"]
        self.tool_call_end_token_array = cached["end_array"]
        self.buffered_delta_text = ""

    def _patched_init(self, tokenizer):
        from vllm.tokenizers.mistral import MistralTokenizer

        # Resolve the actual tokenizer that __init__ will use for encode/decode
        is_mistral = isinstance(tokenizer, MistralTokenizer)
        actual_tokenizer = tokenizer.tokenizer if is_mistral else tokenizer
        key = id(actual_tokenizer)

        cached = _cache.get(key)
        if cached is None:
            # Slow path: first instantiation for this tokenizer, run under lock
            with _lock:
                # Re-check: another thread may have populated it while we waited
                cached = _cache.get(key)
                if cached is None:
                    _original_init(self, tokenizer)
                    _cache[key] = {
                        # Strong reference pins the tokenizer so its id() stays
                        # unique to it for as long as this entry is in the cache.
                        "tokenizer": actual_tokenizer,
                        "start_ids": self.tool_call_start_token_ids,
                        "end_ids": self.tool_call_end_token_ids,
                        "start_array": self.tool_call_start_token_array,
                        "end_array": self.tool_call_end_token_array,
                    }
                    return

        # Fast path: skip encode/decode entirely, set up instance from cache
        _init_from_cache(self, tokenizer, cached, is_mistral)

    Hermes2ProToolParser.__init__ = _patched_init
346-
347-
def monkey_patch_tokenizer_thread_safety():
    """Serialize ``PreTrainedTokenizerFast._encode_plus`` per backend tokenizer.

    vLLM's API server can invoke _encode_plus from several request handlers
    at once. That method mutates the Rust tokenizer's state
    (set_truncation_and_padding -> enable_truncation/enable_padding, and
    encode_special_tokens); overlapping mutable borrows in the PyO3-backed
    object surface as ``RuntimeError: Already borrowed``.

    The replacement method takes a lock keyed by ``id(self._tokenizer)``
    around the whole original call, so the state mutation and the encode
    that follows it run as one unit for a given backend tokenizer.
    """
    import threading

    from transformers import PreTrainedTokenizerFast

    wrapped_encode_plus = PreTrainedTokenizerFast._encode_plus
    locks_by_id: dict[int, threading.Lock] = {}
    registry_guard = threading.Lock()

    def _lock_for(tokenizer_id: int) -> threading.Lock:
        """Return the lock registered for ``tokenizer_id``, creating it on first use."""
        found = locks_by_id.get(tokenizer_id)
        if found is None:
            # Creation is guarded so two threads cannot register different
            # locks for the same id; setdefault keeps whichever got in first.
            with registry_guard:
                found = locks_by_id.setdefault(tokenizer_id, threading.Lock())
        return found

    def _patched_encode_plus(self, *args, **kwargs):
        with _lock_for(id(self._tokenizer)):
            return wrapped_encode_plus(self, *args, **kwargs)

    PreTrainedTokenizerFast._encode_plus = _patched_encode_plus
382-
383-
384249def monkey_patch_minimax_m2_for_lora ():
385250 """Patch vLLM's MiniMaxM2 model for LoRA compatibility.
386251
@@ -457,7 +322,7 @@ def _patched_forward(self, hidden_states):
457322
458323
459324def monkey_patch_harmony_stop_token_propagation ():
460- """Fix: vLLM 0.17.0 doesn't merge harmony stop tokens into per-request SamplingParams.
325+ """Fix: vLLM doesn't merge harmony stop tokens into per-request SamplingParams.
461326
462327 The harmony mode sets stop_token_ids (including <|call|> and <|return|>) in
463328 default_sampling_params at server init, but ChatCompletionRequest.to_sampling_params()
0 commit comments