129 changes: 129 additions & 0 deletions .github/workflows/pr-test-mlx.yml
@@ -0,0 +1,129 @@
name: PR Test (MLX)

on:
push:
branches: [main]
paths:
- "crates/grpc_client/proto/mlx_engine.proto"
- "crates/grpc_client/src/mlx_engine.rs"
- "crates/grpc_client/python/**"
- "grpc_servicer/smg_grpc_servicer/mlx/**"
- "grpc_servicer/pyproject.toml"
- "e2e_test/mlx/test_mlx_backend.py"
- "e2e_test/infra/__init__.py"
- "e2e_test/infra/model_specs.py"
- "e2e_test/infra/worker.py"
- "e2e_test/infra/constants.py"
Comment on lines +14 to +16

P2: Add infra init to MLX workflow path filters

The new workflow watches several MLX-related E2E infra files but omits `e2e_test/infra/__init__.py`, even though this commit changes that module and `e2e_test/conftest.py` imports from `infra` at startup. A future PR that only updates `infra/__init__.py` can break MLX test startup/import resolution without triggering this workflow, so regressions can merge untested.

Collaborator Author: added

- ".github/workflows/pr-test-mlx.yml"
coderabbitai[bot] marked this conversation as resolved.
pull_request:
branches: [main]
types: [opened, synchronize, reopened]
paths:
- "crates/grpc_client/proto/mlx_engine.proto"
- "crates/grpc_client/src/mlx_engine.rs"
- "crates/grpc_client/python/**"
- "grpc_servicer/smg_grpc_servicer/mlx/**"
Comment on lines +22 to +25

P2: Include router-side MLX paths in workflow trigger

When a PR only changes the MLX branches in the router, e.g. `model_gateway/src/routers/grpc/client.rs` or `proto_wrapper.rs`, this workflow will not run, because the `pull_request.paths` list here covers only the proto/client package, the servicer, and the E2E infra. The regular PR GPU workflow's reusable E2E matrix covers only sglang, vllm, and trtllm, so those MLX router paths are not exercised there either; an MLX-specific routing regression can therefore merge without running the new Apple Silicon E2E job.

- "grpc_servicer/pyproject.toml"
- "e2e_test/mlx/test_mlx_backend.py"
- "e2e_test/infra/__init__.py"
- "e2e_test/infra/model_specs.py"
- "e2e_test/infra/worker.py"
- "e2e_test/infra/constants.py"
Comment on lines +27 to +31

P2: Include MLX fixture files in path filters

The MLX test module runs through `e2e_test/conftest.py` and the `setup_backend`/`api_client` fixtures in `e2e_test/fixtures/setup_backend.py`, but this new workflow watches only the test file plus a few infra modules. A PR that changes those shared fixtures can break MLX startup/client wiring without triggering this macOS workflow. The existing GPU workflow filters already treat `e2e_test/conftest.py` and `e2e_test/fixtures/**` as common E2E inputs; this new workflow does not.

- ".github/workflows/pr-test-mlx.yml"
workflow_dispatch:

permissions:
contents: read

concurrency:
group: mlx-tests-${{ github.ref }}
cancel-in-progress: ${{ github.event_name == 'pull_request' }}

jobs:
e2e-mlx:
name: E2E (MLX on Apple Silicon)
runs-on: macos-latest
timeout-minutes: 30
permissions:
contents: read
env:
E2E_RUNTIME: mlx
E2E_ENGINE: mlx
PYTHONUNBUFFERED: "1"
steps:
- name: Checkout code
uses: actions/checkout@v6

- name: Set up Python
uses: actions/setup-python@v6
with:
python-version: "3.12"

- name: Install Rust toolchain
uses: dtolnay/rust-toolchain@stable

- name: Install protoc
run: brew install protobuf

- name: Cache Cargo registry and target
uses: Swatinem/rust-cache@v2
with:
shared-key: mlx-pr-test
cache-on-failure: true

- name: Install uv (for openapi codegen)
run: pip install uv

- name: Build smg-grpc-proto Python package (proto codegen)
run: pip install -e ./crates/grpc_client/python

- name: Install grpc_servicer with MLX extra (mlx + mlx-lm)
run: pip install -e "./grpc_servicer[mlx]"

- name: Build and install SMG Python bindings (ci profile)
working-directory: bindings/python
run: |
pip install maturin
# `ci` profile (opt-level=2, thin LTO, 16 codegen-units) — faster
# to compile than release, runtime still plenty fast for a
# correctness E2E test.
# Use `maturin build` + `pip install` (not `maturin develop`)
# because the GitHub-hosted runner's Python is not in a virtualenv.
maturin build --profile ci --out dist
pip install dist/*.whl

- name: Generate Python client types (required by e2e_test/conftest.py)
run: make generate-python-types

- name: Install smg-client
run: pip install ./clients/python

- name: Install E2E test dependencies
run: pip install ./e2e_test

- name: Verify imports
run: |
python -c "from smg_grpc_proto import mlx_engine_pb2, mlx_engine_pb2_grpc; print('proto OK')"
python -c "from smg_grpc_servicer.mlx.servicer import MlxEngineServicer; print('servicer OK')"
python -c "import smg; print('smg OK')"
python -c "from smg_client import SmgClient; print('smg_client OK')"
python -c "import mlx_lm; print('mlx-lm OK')"

- name: Run MLX E2E tests
env:
SHOW_WORKER_LOGS: "1"
SHOW_ROUTER_LOGS: "1"
E2E_LOG_DIR: e2e-logs
run: |
pytest e2e_test/mlx/test_mlx_backend.py \
-s -vv \
--reruns 1 --reruns-delay 5

- name: Upload logs on failure
if: failure() || cancelled()
uses: actions/upload-artifact@v7
with:
name: e2e-mlx-logs
path: e2e-logs/
retention-days: 7
if-no-files-found: ignore
2 changes: 2 additions & 0 deletions e2e_test/infra/__init__.py
@@ -33,6 +33,7 @@
Runtime,
WorkerType,
get_runtime,
is_mlx,
is_sglang,
is_trtllm,
is_vllm,
@@ -111,6 +112,7 @@
"is_vllm",
"is_sglang",
"is_trtllm",
"is_mlx",
# Port utilities
"get_open_port",
"release_port",
13 changes: 12 additions & 1 deletion e2e_test/infra/constants.py
@@ -25,6 +25,7 @@ class Runtime(StrEnum):
SGLANG = "sglang"
VLLM = "vllm"
TRTLLM = "trtllm"
MLX = "mlx"
OPENAI = "openai"
XAI = "xai"
GEMINI = "gemini"
@@ -33,7 +34,7 @@ class Runtime(StrEnum):

# Convenience sets
LOCAL_MODES = frozenset({ConnectionMode.HTTP, ConnectionMode.GRPC})
LOCAL_RUNTIMES = frozenset({Runtime.SGLANG, Runtime.VLLM, Runtime.TRTLLM})
LOCAL_RUNTIMES = frozenset({Runtime.SGLANG, Runtime.VLLM, Runtime.TRTLLM, Runtime.MLX})
CLOUD_RUNTIMES = frozenset({Runtime.OPENAI, Runtime.XAI, Runtime.GEMINI, Runtime.ANTHROPIC})

# Fixture parameter names (used in @pytest.mark.parametrize)
@@ -100,11 +101,21 @@ def is_trtllm() -> bool:
return get_runtime() == "trtllm"


def is_mlx() -> bool:
"""Check if tests are running with MLX runtime (Apple Silicon only).

Returns:
True if E2E_RUNTIME is "mlx", False otherwise.
"""
return get_runtime() == "mlx"
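The runtime helpers added here all follow the same env-var gating pattern. A minimal standalone sketch of that pattern, assuming `get_runtime()` reads `E2E_RUNTIME` (the `"sglang"` fallback below is an illustrative assumption, not necessarily the real default):

```python
import os

# Standalone sketch of the runtime-gating pattern; get_runtime() is
# assumed to read E2E_RUNTIME, and the "sglang" fallback is illustrative.
def get_runtime() -> str:
    return os.environ.get("E2E_RUNTIME", "sglang")

def is_mlx() -> bool:
    return get_runtime() == "mlx"

os.environ["E2E_RUNTIME"] = "mlx"
print(is_mlx())  # True
```

With `E2E_RUNTIME=mlx` exported by the workflow above, every `is_mlx()` call site in the suite flips on without plumbing a flag through fixtures.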


# Runtime display labels
RUNTIME_LABELS = {
"sglang": "SGLang",
"vllm": "vLLM",
"trtllm": "TensorRT-LLM",
"mlx": "MLX",
}

ENV_SHOW_ROUTER_LOGS = "SHOW_ROUTER_LOGS"
10 changes: 10 additions & 0 deletions e2e_test/infra/model_specs.py
@@ -184,6 +184,16 @@ def _resolve_model_path(hf_path: str) -> str:
"--enable-chunked-prefill",
],
},
# ── MLX models (Apple Silicon only) ──────────────────────────────────────
# Smallest Qwen3 with native tool calling + thinking mode (~400 MB).
# Used by CI on macos-latest runners. Qwen3 emits <tool_call> tags
# parsed by SMG's --tool-call-parser qwen, and uses <think> tags
# parsed by the reasoning parser.
"mlx-community/Qwen3-0.6B-4bit": {
"model": _resolve_model_path("mlx-community/Qwen3-0.6B-4bit"),
"tp": 1,
"features": ["chat", "streaming", "function_calling", "reasoning", "thinking"],
},
}
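The `<tool_call>` wire format mentioned in the spec comment above can be illustrated with a toy extractor. This is not SMG's actual `--tool-call-parser qwen` implementation, only a sketch of the tag shape Qwen3 emits:

```python
import json
import re

# Toy extractor for Qwen3-style <tool_call> JSON payloads; SMG's real
# parser (--tool-call-parser qwen) is more robust than this sketch.
TOOL_CALL_RE = re.compile(r"<tool_call>\s*(\{.*?\})\s*</tool_call>", re.DOTALL)

def extract_tool_calls(text: str) -> list[dict]:
    return [json.loads(payload) for payload in TOOL_CALL_RE.findall(text)]

sample = (
    "<think>user wants weather, call the tool</think>\n"
    '<tool_call>{"name": "get_weather", "arguments": {"city": "Paris"}}</tool_call>'
)
print(extract_tool_calls(sample)[0]["name"])  # get_weather
```

The `<think>…</think>` span is handled separately by the reasoning parser; the point here is just that tool calls arrive as tagged JSON, which is why the "function_calling" feature flag can be exercised on a 400 MB model.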


26 changes: 26 additions & 0 deletions e2e_test/infra/worker.py
@@ -178,6 +178,8 @@ def _build_cmd(self) -> list[str]:
return self._build_vllm_http_cmd(model_path, tp_size, spec)
elif self.engine == "trtllm":
return self._build_trtllm_cmd(model_path, tp_size, spec)
elif self.engine == "mlx":
return self._build_mlx_cmd(model_path, spec)
🟡 Nit: `_build_env` (line 328) unconditionally sets `CUDA_VISIBLE_DEVICES` for every engine, including MLX, which runs on Apple Metal rather than CUDA. It's harmless on macOS (the variable is simply ignored), but if you want to keep things tidy you could skip it for MLX:

if self.engine != "mlx":
    env["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, self.gpu_ids))

Not blocking; just a readability note for the next person who reads `_build_env`.
else:
raise ValueError(f"Unsupported engine: {self.engine}")

@@ -261,6 +263,30 @@ def _build_vllm_base_cmd(
cmd.extend(extra)
return cmd

def _build_mlx_cmd(self, model_path: str, spec: dict) -> list[str]:
"""Build MLX gRPC server command (Apple Silicon only).

MLX backend only supports gRPC mode (no HTTP variant) since the
servicer wraps mlx-lm's BatchGenerator behind the MlxEngine proto.
"""
if self.mode != ConnectionMode.GRPC:
raise ValueError("MLX backend only supports gRPC mode")
cmd = [
"python3",
"-m",
"smg_grpc_servicer.mlx.server",
"--model",
model_path,
"--host",
DEFAULT_HOST,
"--port",
str(self.port),
]
extra = spec.get("mlx_args", [])
if extra:
cmd.extend(extra)
return cmd
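A standalone sketch of the command this method produces (the host and port below are stand-in values; the real infra supplies `DEFAULT_HOST` and an allocated port):

```python
# Standalone sketch of the MLX server command construction above;
# DEFAULT_HOST and the port value are stand-ins for illustration.
DEFAULT_HOST = "127.0.0.1"

def build_mlx_cmd(model_path: str, port: int, spec: dict) -> list[str]:
    cmd = [
        "python3", "-m", "smg_grpc_servicer.mlx.server",
        "--model", model_path,
        "--host", DEFAULT_HOST,
        "--port", str(port),
    ]
    cmd.extend(spec.get("mlx_args", []))  # optional per-model extras
    return cmd

print(build_mlx_cmd("mlx-community/Qwen3-0.6B-4bit", 50051, {}))
```

Note there is no `tp` handling, matching `_build_mlx_cmd`'s signature: the MLX servicer runs single-device, so tensor parallelism never enters the command line.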

def _build_trtllm_cmd(self, model_path: str, tp_size: int, spec: dict) -> list[str]:
"""Build TensorRT-LLM gRPC server command."""
# Create config file to enable xgrammar guided decoding
Empty file added e2e_test/mlx/__init__.py
Empty file.