Commits (47)
cd167ad
vllm mori integration from moreh team
whitememory Oct 1, 2025
77c8a56
Update vllm/model_executor/layers/fused_moe/modular_kernel.py
whitememory Oct 1, 2025
ea17a69
Update vllm/model_executor/layers/fused_moe/aiter_experts.py
whitememory Oct 1, 2025
a00b38b
applied pre-commit results and Read the Docs build results
whitememory Oct 2, 2025
07095ff
applied few suggestions from code-assistant
whitememory Oct 2, 2025
26ec16d
applied pre-commit result from github
whitememory Oct 2, 2025
5c997be
applied some pre-commit results from github
whitememory Oct 2, 2025
9849fa6
hope this is last for pre-commit
whitememory Oct 2, 2025
f4fb63a
removing unrelated change
whitememory Oct 2, 2025
c4bbc1a
Merge branch 'main' into mori_moreh
whitememory Oct 3, 2025
6dabf63
Merge branch 'main' into mori_moreh
whitememory Oct 3, 2025
19e7d40
Merge branch 'main' into mori_moreh
whitememory Oct 3, 2025
a1de125
Merge branch 'main' into mori_moreh
whitememory Oct 4, 2025
d2f65f6
Merge branch 'main' into mori_moreh
whitememory Oct 4, 2025
fb01286
Merge branch 'main' into mori_moreh
whitememory Oct 6, 2025
739b489
applied pre-commit results after merging main to
whitememory Oct 6, 2025
770676e
applied additional pre-commit results
whitememory Oct 6, 2025
ff08bc8
Merge branch 'main' into mori_moreh
whitememory Oct 6, 2025
059f29a
applied pre-commit results...
whitememory Oct 6, 2025
221edae
Merge branch 'main' into mori_moreh
whitememory Oct 7, 2025
fc55d72
Merge branch 'main' into mori_moreh
whitememory Oct 7, 2025
7d52023
applied SageMoore's comments
whitememory Oct 9, 2025
e357840
Merge branch 'main' into mori_moreh
whitememory Oct 9, 2025
6d8ef43
following code difference from main
whitememory Oct 9, 2025
eace564
Applied few other comments from bnellnm
whitememory Oct 9, 2025
73093c5
Applied some pre-commit results.
whitememory Oct 9, 2025
b9b9a9b
refactor workspace_shapes of AiterExperts and handle_cache of MoriAll…
ihbang Oct 9, 2025
d3c6ce0
clean-up handle_cache at destroy() of mori a2a manager
whitememory Oct 9, 2025
32482ee
fixed according to SM211 rule
whitememory Oct 10, 2025
73a17d5
Merge branch 'main' into mori_moreh
whitememory Oct 10, 2025
eda8c8e
adding mori backend to moe kernel feature doc
whitememory Oct 10, 2025
797d819
Merge branch 'main' into mori_moreh
whitememory Oct 10, 2025
ddd3563
applied reviews from bnellnm
whitememory Oct 11, 2025
272205e
Merge branch 'main' into mori_moreh
whitememory Oct 11, 2025
4abf225
Merge branch 'main' into mori_moreh
whitememory Oct 12, 2025
1dbff2c
new precommit removed Optional
whitememory Oct 13, 2025
665a631
additional pre-commit results about Optional
whitememory Oct 13, 2025
753a506
Merge branch 'main' into mori_moreh
whitememory Oct 13, 2025
096f938
fixed few whitespaces
whitememory Oct 15, 2025
8badbd6
Merge branch 'main' into mori_moreh
whitememory Oct 15, 2025
25f8d59
applied pre-commit result
whitememory Oct 15, 2025
a476913
Add json config parsing logic to change mori configs easily
ihbang Oct 20, 2025
4c4306e
Applied review from HAIAI
ihbang Oct 20, 2025
3adc79e
add quant_dtype check on _make_mori_config
ihbang Oct 21, 2025
d4a1529
Merge branch 'main' into mori_moreh
whitememory Oct 27, 2025
756898a
moved has_mori to import_utils.py
whitememory Oct 27, 2025
e9b8624
fixed few more things (conflict and pre-commit)
whitememory Oct 27, 2025
4 changes: 4 additions & 0 deletions docs/design/moe_kernel_features.md
@@ -41,6 +41,7 @@ th {
| flashinfer<sup>4</sup> | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferCutlassMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferCutlassMoEPrepareAndFinalize] |
Collaborator: Please remove this line (duplicate of prior line).

Author: This is not part of this PR, and it's not related. May I fix it in this PR? If so, I will remove it.
| MoEPrepareAndFinalizeNoEP<sup>5</sup> | standard | fp8,int8 | G,A,T | N | Y | [`MoEPrepareAndFinalizeNoEP`][vllm.model_executor.layers.fused_moe.prepare_finalize.MoEPrepareAndFinalizeNoEP] |
| BatchedPrepareAndFinalize<sup>5</sup> | batched | fp8,int8 | G,A,T | N | Y | [`BatchedPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedPrepareAndFinalize] |
| MoriPrepareAndFinalize<sup>7</sup> | standard | fp8<sup>8</sup> | G(128),A,T<sup>8</sup> | N | Y | [`MoriPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.mori_prepare_finalize.MoriPrepareAndFinalize] |

!!! info "Table key"
1. All types: mxfp4, nvfp4, int4, int8, fp8
@@ -49,6 +50,8 @@ th {
4. Controlled by different env vars (`VLLM_FLASHINFER_MOE_BACKEND` "throughput" or "latency")
5. This is a no-op dispatcher that can be used to pair with any modular experts to produce a modular kernel that runs w/o dispatch or combine. These cannot be selected via environment variable. These are generally used for testing or adapting an expert subclass to the `fused_experts` API.
6. This depends on the experts implementation.
7. Currently, MoRI supports low-latency mode only.
8. This depends on the experts implementation; currently MoRI is paired with the AITER experts only.
Collaborator: Is this meant to explain, or is it a direct answer to fp8?

Author: Yes, we integrated MoRI with the AITER MoE only. We found it natural to follow the quant type and quant format of the ROCm AITER MoE, which is in this doc at about line 103.


---

@@ -118,3 +121,4 @@ The following table shows "families" of modular kernels that are intended to wor
| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,</br>`TritonExperts`,</br>`TritonOrDeepGemmExperts`,</br>`CutlassExpertsFp8`, </br>`MarlinExperts` |
| deepep_low_latency,</br>pplx | `DeepEPLLPrepareAndFinalize`,</br>`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`BatchedTritonOrDeepGemmExperts`,</br>`CutlassBatchedExpertsFp8`|
| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` |
| mori | `MoriPrepareAndFinalize` | `AiterExperts` |
211 changes: 210 additions & 1 deletion vllm/distributed/device_communicators/all2all.py
@@ -9,7 +9,7 @@
from vllm.distributed import get_dp_group, get_ep_group
from vllm.forward_context import get_forward_context
from vllm.logger import init_logger
from vllm.utils import has_deep_ep, has_pplx
from vllm.utils import has_deep_ep, has_mori, has_pplx
from vllm.utils.flashinfer import has_flashinfer_all2all

from .base_device_communicator import All2AllManagerBase, Cache
@@ -474,3 +474,212 @@ def cleanup(self):
self.prepare_workspace_tensor = None
self.mapping = None
self.initialized = False


class MoriAll2AllManager(All2AllManagerBase):
"""
All2All communication based on mori kernels.
"""

def __init__(self, cpu_group):
assert has_mori(), "Please install mori from ROCm/mori github."

super().__init__(cpu_group)
self.handle_cache = Cache()
self.config = None
self._shmem_initialized = False
# Delay mori shmem initialization until first use
logger.debug("[rank %s] MoriAll2AllManager created", self.rank)

def _ensure_shmem_initialized(self):
"""Initialize mori's shared memory system lazily"""
if self._shmem_initialized:
return

import mori.shmem
import torch.distributed as dist

try:
# Check if we have a valid backend
backend = dist.get_backend()
if backend is None:
raise RuntimeError("No valid distributed backend found")

logger.debug(
"[rank %s] PyTorch distributed ready with backend: %s",
self.rank,
backend,
)

current_group = (
self.cpu_group if self.cpu_group is not None else dist.group.WORLD
)
group_name = "mori_shmem_group"

try:
import torch._C._distributed_c10d as c10d

# Register the current process group
c10d._register_process_group(group_name, current_group)
logger.debug(
"[rank %s] Registered proc group %s", self.rank, group_name
)

# Initialize mori shmem with the registered group
mori.shmem.shmem_torch_process_group_init(group_name)
Collaborator: Are two TP4 instances supported?

Author: Good point! We missed that possibility. If this does not work for two different instances on the same node, we will think about how to give a unique group_name.

ihbang (Oct 20, 2025): Fixed it so that multiple instances use their own unique group name, which includes their PPID (parent process ID). This works because all ranks in one instance share the same PPID.
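A minimal sketch of the per-instance group naming described in this thread (an assumption about the follow-up commit, not the code shown in this diff): derive the process-group name from the parent PID so two vLLM instances on one node register distinct mori shmem groups.

```python
import os


def _mori_shmem_group_name() -> str:
    # All ranks spawned by one vLLM instance share the same parent PID, so the
    # name is unique per instance while identical across that instance's ranks.
    return f"mori_shmem_group_{os.getppid()}"
```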

logger.debug("[rank %s] torch proc group shmem init success", self.rank)
self._shmem_initialized = True
return

except Exception as torch_error:
logger.debug(
"[rank %s] torch process group shmem init failed: %s",
self.rank,
torch_error,
)
self._shmem_initialized = True
logger.warning(
"[rank %s] Continue without mori shmem optimize", self.rank
)

except Exception as e:
logger.error("[rank %s] mori shmem init failed: %s", self.rank, e)
# Don't fail completely - mark as initialized to avoid retry loops
self._shmem_initialized = True
Collaborator: Can we handle this differently instead of setting it to True?

Author: @ihbang, would you take a look into this? I was worried about this handling as well.

Reply: Fixed it to raise an exception when shmem initialization fails.
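A minimal sketch of the fail-fast behavior described above (an assumption about the follow-up change; it reuses only calls already present in this diff): surface the error instead of marking the manager initialized and silently degrading.

```python
import mori.shmem
import torch._C._distributed_c10d as c10d


def _init_mori_shmem_or_raise(group_name: str, process_group) -> None:
    """Hypothetical helper: register the process group and initialize mori
    shmem, raising instead of falling back when initialization fails."""
    try:
        c10d._register_process_group(group_name, process_group)
        mori.shmem.shmem_torch_process_group_init(group_name)
    except Exception as e:
        # Fail fast so misconfiguration surfaces at startup rather than
        # showing up later as degraded all2all behavior.
        raise RuntimeError(
            f"mori shmem initialization failed for group '{group_name}'"
        ) from e
```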

logger.warning(
"[rank %s] Continuing without mori shmem optimize", self.rank
)

def _make_mori_config(
self,
max_num_tokens: int,
num_local_experts: int,
experts_per_token: int,
hidden_dim: int,
scale_dim: int,
scale_type_size: int,
data_type: torch.dtype = torch.bfloat16,
quant_dtype: Optional[torch.dtype] = None,
):
"""Create mori EpDispatchCombineConfig"""
import mori.ops.dispatch_combine as mori_ops
from mori.ops.dispatch_combine import EpDispatchCombineKernelType

Collaborator: Should we add checks for data_type and quant_dtype before proceeding?

Reply: A quant_dtype check was added. A data_type check is not included because mori doesn't seem to have a specific dtype restriction.
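A minimal sketch of the quant_dtype guard described in the reply (the accepted dtype set below is an assumption about the ROCm fp8 path; the final guard is not shown in this diff):

```python
import torch

# Hypothetical guard at the top of _make_mori_config: the AITER fp8 path on
# ROCm uses the fnuz fp8 format, so reject anything else early.
_SUPPORTED_QUANT_DTYPES = (torch.float8_e4m3fnuz,)


def _check_quant_dtype(quant_dtype: torch.dtype | None) -> None:
    if quant_dtype is not None and quant_dtype not in _SUPPORTED_QUANT_DTYPES:
        raise ValueError(
            f"Unsupported quant_dtype {quant_dtype} for mori dispatch/combine; "
            f"expected one of {_SUPPORTED_QUANT_DTYPES}"
        )
```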

config = mori_ops.EpDispatchCombineConfig(
data_type=data_type if quant_dtype is None else quant_dtype,
rank=self.rank,
world_size=self.world_size,
hidden_dim=hidden_dim,
max_num_inp_token_per_rank=max_num_tokens,
num_experts_per_rank=num_local_experts,
num_experts_per_token=experts_per_token,
# Performance tuning parameters
# warp_num_per_block=8,
# block_num=80,
max_token_type_size=data_type.itemsize,
# Quantization support
scale_dim=scale_dim,
scale_type_size=scale_type_size,
# Determine kernel type based on topology
kernel_type=(
EpDispatchCombineKernelType.InterNode
if self.internode
else EpDispatchCombineKernelType.IntraNode
),
)

return config

def get_handle(self, kwargs):
"""
Get or create mori operation handle.
Args:
kwargs: Dictionary with keys:
- max_num_tokens: Maximum tokens per DP rank
- num_local_experts: Number of local experts
- experts_per_token: Number of experts per token (topk)
- hidden_dim: Hidden dimension size
- scale_dim: Quantization scale dimension
- scale_type_size: Quantization scale element size in bytes
- data_type: Tensor data type (optional, default bfloat16)
- quant_dtype: Quantized data type (optional)
"""
# Ensure shmem is initialized before creating handles
self._ensure_shmem_initialized()

def create_mori_handle(
max_num_tokens: int,
num_local_experts: int,
experts_per_token: int,
hidden_dim: int,
scale_dim: int,
scale_type_size: int,
data_type: torch.dtype = torch.bfloat16,
quant_dtype: Optional[torch.dtype] = None,
):
import mori
Collaborator: Use a specific import here.

Reply: Fixed (some other imports were also fixed).


config = self._make_mori_config(
max_num_tokens=max_num_tokens,
num_local_experts=num_local_experts,
experts_per_token=experts_per_token,
hidden_dim=hidden_dim,
scale_dim=scale_dim,
scale_type_size=scale_type_size,
data_type=data_type,
quant_dtype=quant_dtype,
)
op = mori.ops.EpDispatchCombineOp(config)
logger.debug(
"[rank %s] Created mori handle with config: tokens=%d, experts=%d,"
" topk=%d, hidden_dim=%d",
self.dp_rank,
max_num_tokens,
num_local_experts,
experts_per_token,
hidden_dim,
)
return op

return self.handle_cache.get_or_create(kwargs, create_mori_handle)
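A hedged usage sketch of `get_handle` (it assumes `manager` is an already-constructed `MoriAll2AllManager` inside an initialized EP group; the numeric values are illustrative placeholders, not defaults):

```python
import torch

# Illustrative kwargs for a fp8-quantized MoE layer; real values come from the
# layer's FusedMoEConfig.
handle = manager.get_handle(
    {
        "max_num_tokens": 256,
        "num_local_experts": 8,
        "experts_per_token": 8,
        "hidden_dim": 7168,
        "scale_dim": 7168 // 128,  # one scale per 128-element group (G(128))
        "scale_type_size": 4,      # float32 scale entries
        "data_type": torch.bfloat16,
        "quant_dtype": torch.float8_e4m3fnuz,
    }
)
```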

def dispatch(
self,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
is_sequence_parallel: bool = False,
):
raise NotImplementedError

def combine(
self,
hidden_states: torch.Tensor,
is_sequence_parallel: bool = False,
):
raise NotImplementedError

def destroy(self):
"""Clean up mori resources"""
try:
# Clear operation handle cache
with self.handle_cache._lock:
for _, handle in self.handle_cache._cache.items():
handle.destroy()

# finalize mori shared memory if it was initialized
if self._shmem_initialized:
try:
import mori.shmem

# Check if shmem is actually active before finalizing
mori.shmem.shmem_finalize()
logger.debug("[rank %s] mori shmem finalize", self.dp_rank)
except Exception as shmem_error:
logger.debug(
"[rank %s] shmem finalize failed "
"(may not have been active): %s",
self.dp_rank,
shmem_error,
)

logger.debug("[rank %s] mori resources cleaned up", self.dp_rank)

except Exception as e:
logger.warning("[rank %s] mori cleanup fail: %s", self.dp_rank, e)
vllm/distributed/device_communicators/base_device_communicator.py
@@ -8,6 +8,10 @@
import torch.distributed as dist
from torch.distributed import ProcessGroup

from vllm.logger import init_logger

logger = init_logger(__name__)


class Cache:
def __init__(self):
5 changes: 5 additions & 0 deletions vllm/distributed/device_communicators/cuda_communicator.py
@@ -122,6 +122,11 @@ def __init__(

self.all2all_manager = FlashInferAllToAllManager(self.cpu_group)
logger.info("Using Flashinfer all2allv manager.")
elif all2all_backend == "mori":
from .all2all import MoriAll2AllManager

self.all2all_manager = MoriAll2AllManager(self.cpu_group)
logger.info("Using Mori all2all manager.")
else:
raise ValueError(f"Unknown all2all backend: {all2all_backend}")

3 changes: 3 additions & 0 deletions vllm/envs.py
@@ -157,6 +157,7 @@
VLLM_ALL2ALL_BACKEND: Literal[
"naive",
"pplx",
"mori",
"deepep_high_throughput",
"deepep_low_latency",
"allgather_reducescatter",
@@ -1141,6 +1142,7 @@ def get_vllm_port() -> Optional[int]:
# - "allgather_reducescatter": all2all implementation based on allgather and
# reducescatter
# - "pplx": use pplx kernels
# - "mori": use mori kernels (currently, only low-latency is supported)
# - "deepep_high_throughput", use deepep high-throughput kernels
# - "deepep_low_latency", use deepep low-latency kernels
# - "flashinfer_all2allv", use flashinfer alltoallv kernels for mnnvl
@@ -1150,6 +1152,7 @@ def get_vllm_port() -> Optional[int]:
[
"naive",
"pplx",
"mori",
"deepep_high_throughput",
"deepep_low_latency",
"allgather_reducescatter",
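For context, a hedged sketch of selecting this backend from user code (the model name and parallel settings are illustrative only, and a ROCm build with mori installed is assumed):

```python
import os

# Select the mori all2all backend before vLLM is initialized.
os.environ["VLLM_ALL2ALL_BACKEND"] = "mori"

from vllm import LLM

# Expert parallelism is what exercises the all2all dispatch/combine path; a
# single-GPU run without EP will not hit the mori kernels.
llm = LLM(
    model="deepseek-ai/DeepSeek-V2-Lite",
    tensor_parallel_size=8,
    enable_expert_parallel=True,
)
```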
2 changes: 2 additions & 0 deletions vllm/model_executor/layers/fused_moe/__init__.py
@@ -4,6 +4,7 @@
from contextlib import contextmanager
from typing import Any, Optional

from vllm.model_executor.layers.fused_moe.aiter_experts import AiterExperts
Collaborator: Add a ROCm check.

Reply: A ROCm check was added.
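A minimal sketch of the guarded import described above (an assumption about the follow-up commit; exact placement and fallback may differ):

```python
from vllm.platforms import current_platform

# AiterExperts wraps ROCm AITER kernels, so only export it on ROCm builds.
if current_platform.is_rocm():
    from vllm.model_executor.layers.fused_moe.aiter_experts import AiterExperts
```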

from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
from vllm.model_executor.layers.fused_moe.layer import (
FusedMoE,
@@ -94,6 +95,7 @@ def get_config() -> Optional[dict[str, Any]]:
"BatchedDeepGemmExperts",
"TritonOrDeepGemmExperts",
"BatchedTritonOrDeepGemmExperts",
"AiterExperts",
]
else:
# Some model classes directly use the custom ops. Add placeholders