Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ In theory, vllm-plugin-FL can support all models available in vLLM, as long as n

### Setup

1. Install vllm from the official [v0.13.0](https://github.com/vllm-project/vllm/tree/v0.13.0) (optional if the correct version is installed) or from the fork [vllm-FL](https://github.com/flagos-ai/vllm-FL).
1. Install vllm from the official [v0.18.1](https://github.com/vllm-project/vllm/tree/v0.18.1) (optional if the correct version is installed) or from the fork [vllm-FL](https://github.com/flagos-ai/vllm-FL).

Comment on lines 35 to 39
Copy link

Copilot AI Mar 31, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PR title/description says upgrade to vLLM 0.18.0, but this change updates docs/source references to v0.18.1 (and several file headers also reference v0.18.1). Please align the stated target version (either update PR metadata to 0.18.1 or change the references back to 0.18.0) to avoid confusion about the required dependency version.

Copilot uses AI. Check for mistakes.

2. Install vllm-plugin-FL
Expand Down Expand Up @@ -66,6 +66,7 @@ In theory, vllm-plugin-FL can support all models available in vLLM, as long as n

```sh
git clone https://github.com/flagos-ai/FlagGems
cd FlagGems
git checkout v5.0.0
pip install --no-build-isolation .
# or editable install
Expand Down
5 changes: 5 additions & 0 deletions tests/unit_tests/ops/test_layernorm.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@
class TestRMSNormFL:
"""Test RMSNormFL class behavior."""

def __init__(self):
from vllm.config import VllmConfig, set_current_vllm_config

set_current_vllm_config(VllmConfig())

@pytest.fixture
def mock_call_op(self):
with patch("vllm_fl.ops.layernorm.call_op") as mock:
Expand Down
3 changes: 3 additions & 0 deletions tests/unit_tests/worker/test_model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def test_fields_match_expected_contract(self):
"aux_hidden_states",
"ec_connector_output",
"cudagraph_stats",
"slot_mappings",
)
assert ExecuteModelState._fields == expected_fields, (
"ExecuteModelState fields changed - this may break execute_model consumers"
Expand All @@ -79,6 +80,7 @@ def test_immutability_prevents_accidental_mutation(self):
aux_hidden_states=None,
ec_connector_output=None,
cudagraph_stats=None,
slot_mappings=None,
)

with pytest.raises(AttributeError):
Expand All @@ -101,6 +103,7 @@ def test_unpacking_for_downstream_processing(self):
aux_hidden_states=None,
ec_connector_output=None,
cudagraph_stats=None,
slot_mappings=None,
)

# Simulate downstream unpacking
Expand Down
70 changes: 7 additions & 63 deletions vllm_fl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,75 +45,19 @@ def register():


def register_model():
"""Register the FL model."""
from vllm import ModelRegistry
import vllm.model_executor.models.qwen3_next as qwen3_next_module
"""Register FL-specific models not yet upstream."""
# Models now upstream in vLLM v0.18.1 (no longer need plugin registration):
# Qwen3NextForCausalLM, Qwen3_5MoeForConditionalGeneration,
# MiniCPMO, KimiK25ForConditionalGeneration, Qwen3_5MoeConfig

# Register Qwen3.5 MoE config
try:
from vllm.transformers_utils.config import _CONFIG_REGISTRY
from vllm_fl.configs.qwen3_5_moe import Qwen3_5MoeConfig
_CONFIG_REGISTRY["qwen3_5_moe"] = Qwen3_5MoeConfig
except Exception as e:
logger.error(f"Register Qwen3.5 MoE config error: {str(e)}")

# Register Qwen3Next model
try:
from vllm_fl.models.qwen3_next import Qwen3NextForCausalLM # noqa: F401

qwen3_next_module.Qwen3NextForCausalLM = Qwen3NextForCausalLM
logger.warning(
"Qwen3NextForCausalLM has been patched to use vllm_fl.models.qwen3_next, "
"original vLLM implementation is overridden"
)

ModelRegistry.register_model(
"Qwen3NextForCausalLM",
"vllm_fl.models.qwen3_next:Qwen3NextForCausalLM"
)
except Exception as e:
logger.error(f"Register Qwen3Next model error: {str(e)}")

# Register Qwen3.5 MoE model
try:
ModelRegistry.register_model(
"Qwen3_5MoeForConditionalGeneration",
"vllm_fl.models.qwen3_5:Qwen3_5MoeForConditionalGeneration"
)
except Exception as e:
logger.error(f"Register Qwen3.5 MoE model error: {str(e)}")

# Register MiniCPMO model
try:
ModelRegistry.register_model(
"MiniCPMO",
"vllm_fl.models.minicpmo:MiniCPMO"
)
except Exception as e:
logger.error(f"Register MiniCPMO model error: {str(e)}")

# Register Kimi-K2.5 model
try:
ModelRegistry.register_model(
"KimiK25ForConditionalGeneration",
"vllm_fl.models.kimi_k25:KimiK25ForConditionalGeneration",
)
except Exception as e:
logger.error(f"Register KimiK25 model error: {str(e)}")

# Register GLM-5 (GlmMoeDsa) model
# Register GLM-5 (GlmMoeDsa) — config not yet upstream
try:
from vllm.transformers_utils.config import _CONFIG_REGISTRY
from vllm_fl.configs.glm_moe_dsa import GlmMoeDsaConfig
_CONFIG_REGISTRY["glm_moe_dsa"] = GlmMoeDsaConfig

from vllm_fl.patches.glm_moe_dsa import apply_model_patches as glm5_model
glm5_model()

ModelRegistry.register_model(
"GlmMoeDsaForCausalLM",
"vllm_fl.models.glm_moe_dsa:GlmMoeDsaForCausalLM"
)
#from vllm_fl.patches.glm_moe_dsa import apply_model_patches as glm5_model
#glm5_model()
except Exception as e:
logger.error(f"Register GlmMoeDsa model error: {str(e)}")

Expand Down
4 changes: 2 additions & 2 deletions vllm_fl/attention/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ def patch_mm_encoder_attention():
FLASH_ATTN branch to import directly from vllm.vllm_flash_attn with a
fallback to flash_attn.
"""
import vllm.attention.layers.mm_encoder_attention as mm_mod
from vllm.attention.backends.registry import AttentionBackendEnum
import vllm.model_executor.layers.attention.mm_encoder_attention as mm_mod
from vllm.v1.attention.backends.registry import AttentionBackendEnum

def _patched_maybe_get_vit_flash_attn_backend(attn_backend):
if attn_backend == AttentionBackendEnum.FLASH_ATTN:
Expand Down
Loading
Loading