1 file changed: +9 -8 lines changed

server/text_generation_server/models
@@ -84,8 +84,5 @@
 try:
     from text_generation_server.models.flash_causal_lm import FlashCausalLM
-    from text_generation_server.models.transformers_flash_causal_lm import (
-        TransformersFlashCausalLM,
-    )
     from text_generation_server.models.vlm_causal_lm import VlmCausalLM
     from text_generation_server.models.mllama_causal_lm import MllamaCausalLM
     from text_generation_server.models.custom_modeling.flash_deepseek_v2_modeling import (
@@ -180,6 +177,14 @@
 if MAMBA_AVAILABLE:
     __all__.append(Mamba)
 
+FLASH_TRANSFORMERS_BACKEND = True
+try:
+    from text_generation_server.models.transformers_flash_causal_lm import (
+        TransformersFlashCausalLM,
+    )
+except ImportError:
+    FLASH_TRANSFORMERS_BACKEND = False
+
 
 class ModelType(enum.Enum):
     DEEPSEEK_V2 = {
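
The hunk above uses the common optional-dependency pattern: attempt the import once at module load time, record success or failure in a module-level flag, and let callers branch on the flag instead of handling ImportError themselves. A minimal, self-contained sketch of that pattern, where "fancy_backend" and "FancyModel" are hypothetical names standing in for the real transformers_flash_causal_lm module:

# Sketch of the guarded-import pattern used in the hunk above.
# "fancy_backend" and "FancyModel" are hypothetical, for illustration only.
FANCY_BACKEND_AVAILABLE = True
try:
    from fancy_backend import FancyModel  # optional dependency; may be absent
except ImportError:
    FANCY_BACKEND_AVAILABLE = False


def build_model(name: str):
    # Callers branch on the flag instead of catching ImportError themselves.
    if FANCY_BACKEND_AVAILABLE:
        return FancyModel(name)
    raise RuntimeError(f"fancy_backend is not installed; cannot build {name!r}")
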
@@ -384,12 +389,8 @@ def get_model(
             transformers, modeling_auto.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[model_type]
         )
 
-        if FLASH_ATTENTION and transformers_model_class._supports_flex_attn:
+        if FLASH_TRANSFORMERS_BACKEND and transformers_model_class._supports_flex_attn:
             transformers_causal_lm_class = TransformersFlashCausalLM
-            if lora_adapter_ids is not None and len(lora_adapter_ids) > 0:
-                raise ValueError(
-                    "Flash `Transformers` modeling backend does not support `lora_adapter_ids`."
-                )
 
     quantization_config = config_dict.get("quantization_config", None)
     if quantization_config is None:
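
In get_model, the new flag is combined with the class-level transformers capability attribute _supports_flex_attn to decide which implementation to use. A hedged sketch of that selection logic in isolation; the stand-in classes and the pick_causal_lm_class helper are hypothetical, not part of the repository:

# Sketch of flag-plus-capability gating, mirroring the hunk above.
# The classes and helper here are illustrative stand-ins, not TGI code.
FLASH_TRANSFORMERS_BACKEND = False  # would be set by the guarded import above


class CausalLM:
    """Plain transformers-based fallback."""


class TransformersFlashCausalLM(CausalLM):
    """Flash-attention variant, usable only when the backend imported."""


def pick_causal_lm_class(transformers_model_class) -> type:
    # Require both: the backend imported cleanly AND the model class
    # advertises flex-attention support via `_supports_flex_attn`.
    if FLASH_TRANSFORMERS_BACKEND and getattr(
        transformers_model_class, "_supports_flex_attn", False
    ):
        return TransformersFlashCausalLM
    return CausalLM

With the flag defaulting to False when the import fails, an environment without the flash backend falls back to the plain CausalLM path instead of crashing at import time.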