Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ In theory, vllm-plugin-FL can support all models available in vLLM, as long as n

### Setup

1. Install vllm from the official [v0.13.0](https://github.com/vllm-project/vllm/tree/v0.13.0) (optional if the correct version is installed) or from the fork [vllm-FL](https://github.com/flagos-ai/vllm-FL).
1. Install vllm from the official [v0.18.1](https://github.com/vllm-project/vllm/tree/v0.18.1) (optional if the correct version is installed) or from the fork [vllm-FL](https://github.com/flagos-ai/vllm-FL).

Comment on lines 35 to 39
Copy link

Copilot AI Mar 31, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PR title/description says upgrade to vLLM 0.18.0, but this change updates docs/source references to v0.18.1 (and several file headers also reference v0.18.1). Please align the stated target version (either update PR metadata to 0.18.1 or change the references back to 0.18.0) to avoid confusion about the required dependency version.

Copilot uses AI. Check for mistakes.

2. Install vllm-plugin-FL
Expand Down Expand Up @@ -66,6 +66,7 @@ In theory, vllm-plugin-FL can support all models available in vLLM, as long as n

```sh
git clone https://github.com/flagos-ai/FlagGems
cd FlagGems
git checkout v5.0.0
pip install --no-build-isolation .
# or editable install
Expand Down
5 changes: 5 additions & 0 deletions tests/unit_tests/ops/test_layernorm.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@
class TestRMSNormFL:
"""Test RMSNormFL class behavior."""

def __init__(self):
from vllm.config import VllmConfig, set_current_vllm_config

set_current_vllm_config(VllmConfig())

@pytest.fixture
def mock_call_op(self):
with patch("vllm_fl.ops.layernorm.call_op") as mock:
Expand Down
3 changes: 3 additions & 0 deletions tests/unit_tests/worker/test_model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def test_fields_match_expected_contract(self):
"aux_hidden_states",
"ec_connector_output",
"cudagraph_stats",
"slot_mappings",
)
assert ExecuteModelState._fields == expected_fields, (
"ExecuteModelState fields changed - this may break execute_model consumers"
Expand All @@ -79,6 +80,7 @@ def test_immutability_prevents_accidental_mutation(self):
aux_hidden_states=None,
ec_connector_output=None,
cudagraph_stats=None,
slot_mappings=None,
)

with pytest.raises(AttributeError):
Expand All @@ -101,6 +103,7 @@ def test_unpacking_for_downstream_processing(self):
aux_hidden_states=None,
ec_connector_output=None,
cudagraph_stats=None,
slot_mappings=None,
)

# Simulate downstream unpacking
Expand Down
70 changes: 7 additions & 63 deletions vllm_fl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,75 +45,19 @@ def register():


def register_model():
"""Register the FL model."""
from vllm import ModelRegistry
import vllm.model_executor.models.qwen3_next as qwen3_next_module
"""Register FL-specific models not yet upstream."""
# Models now upstream in vLLM v0.18.1 (no longer need plugin registration):
# Qwen3NextForCausalLM, Qwen3_5MoeForConditionalGeneration,
# MiniCPMO, KimiK25ForConditionalGeneration, Qwen3_5MoeConfig

# Register Qwen3.5 MoE config
try:
from vllm.transformers_utils.config import _CONFIG_REGISTRY
from vllm_fl.configs.qwen3_5_moe import Qwen3_5MoeConfig
_CONFIG_REGISTRY["qwen3_5_moe"] = Qwen3_5MoeConfig
except Exception as e:
logger.error(f"Register Qwen3.5 MoE config error: {str(e)}")

# Register Qwen3Next model
try:
from vllm_fl.models.qwen3_next import Qwen3NextForCausalLM # noqa: F401

qwen3_next_module.Qwen3NextForCausalLM = Qwen3NextForCausalLM
logger.warning(
"Qwen3NextForCausalLM has been patched to use vllm_fl.models.qwen3_next, "
"original vLLM implementation is overridden"
)

ModelRegistry.register_model(
"Qwen3NextForCausalLM",
"vllm_fl.models.qwen3_next:Qwen3NextForCausalLM"
)
except Exception as e:
logger.error(f"Register Qwen3Next model error: {str(e)}")

# Register Qwen3.5 MoE model
try:
ModelRegistry.register_model(
"Qwen3_5MoeForConditionalGeneration",
"vllm_fl.models.qwen3_5:Qwen3_5MoeForConditionalGeneration"
)
except Exception as e:
logger.error(f"Register Qwen3.5 MoE model error: {str(e)}")

# Register MiniCPMO model
try:
ModelRegistry.register_model(
"MiniCPMO",
"vllm_fl.models.minicpmo:MiniCPMO"
)
except Exception as e:
logger.error(f"Register MiniCPMO model error: {str(e)}")

# Register Kimi-K2.5 model
try:
ModelRegistry.register_model(
"KimiK25ForConditionalGeneration",
"vllm_fl.models.kimi_k25:KimiK25ForConditionalGeneration",
)
except Exception as e:
logger.error(f"Register KimiK25 model error: {str(e)}")

# Register GLM-5 (GlmMoeDsa) model
# Register GLM-5 (GlmMoeDsa) — config not yet upstream
try:
from vllm.transformers_utils.config import _CONFIG_REGISTRY
from vllm_fl.configs.glm_moe_dsa import GlmMoeDsaConfig
_CONFIG_REGISTRY["glm_moe_dsa"] = GlmMoeDsaConfig

from vllm_fl.patches.glm_moe_dsa import apply_model_patches as glm5_model
glm5_model()

ModelRegistry.register_model(
"GlmMoeDsaForCausalLM",
"vllm_fl.models.glm_moe_dsa:GlmMoeDsaForCausalLM"
)
#from vllm_fl.patches.glm_moe_dsa import apply_model_patches as glm5_model
#glm5_model()
except Exception as e:
logger.error(f"Register GlmMoeDsa model error: {str(e)}")

Expand Down
4 changes: 2 additions & 2 deletions vllm_fl/attention/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ def patch_mm_encoder_attention():
FLASH_ATTN branch to import directly from vllm.vllm_flash_attn with a
fallback to flash_attn.
"""
import vllm.attention.layers.mm_encoder_attention as mm_mod
from vllm.attention.backends.registry import AttentionBackendEnum
import vllm.model_executor.layers.attention.mm_encoder_attention as mm_mod
from vllm.v1.attention.backends.registry import AttentionBackendEnum

def _patched_maybe_get_vit_flash_attn_backend(attn_backend):
if attn_backend == AttentionBackendEnum.FLASH_ATTN:
Expand Down
Loading
Loading