Fix missing check (#19340) #11461
name: Test Metal Backend

on:
  push:
    branches:
      - main
      - release/*
    tags:
      - ciflow/metal/*
  pull_request:
    paths:
      - .github/workflows/metal.yml
      - backends/apple/metal/**
      - backends/aoti/**
      - examples/models/qwen3_5_moe/**
      - extension/llm/export/**
  workflow_dispatch:
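
# One concurrency group per PR (pushes fall back to the commit SHA); the two
# event_name terms keep workflow_dispatch (and would-be schedule) runs in
# separate groups so manual runs do not cancel PR runs.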
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

jobs:
  test-metal-builds:
    name: test-executorch-metal-build
    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
    with:
      default-packages: ""
      runner: macos-m2-stable
      python-version: '3.11'
      submodules: 'recursive'
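      # For pull_request events, test the PR head commit rather than the
      # auto-generated merge commit; otherwise test the pushed SHA. The same
      # pattern is used by every job below.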
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      timeout: 90
      script: |
        set -eux

        echo "::group::Test ExecuTorch Metal build"
        PYTHON_EXECUTABLE=python CMAKE_ARGS="-DEXECUTORCH_BUILD_METAL=ON" ${CONDA_RUN} --no-capture-output ./install_executorch.sh
        echo "::endgroup::"

  test-metal-modules:
    name: test-metal-backend-modules
    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
    with:
      default-packages: ""
      runner: macos-m2-stable
      python-version: '3.11'
      submodules: 'recursive'
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      timeout: 120
      script: |
        set -eux

        echo "::group::Setup ExecuTorch"
        PYTHON_EXECUTABLE=python ${CONDA_RUN} EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_EXPERIMENTAL_MPS=1 ./install_executorch.sh
        echo "::endgroup::"

        echo "::group::Build Metal Runtime"
        ${CONDA_RUN} backends/apple/metal/tests/run_metal_test.sh --build
        echo "::endgroup::"

        echo "::group::Run Metal Backend Module Tests"
        ${CONDA_RUN} python -m unittest backends.apple.metal.tests.test_modules.TestMetalBackendModules
        echo "::endgroup::"

  test-metal-qwen35-moe-tiny:
    name: test-metal-qwen35-moe-tiny
    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
    with:
      default-packages: ""
      runner: macos-m2-stable
      python-version: '3.11'
      submodules: 'recursive'
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      timeout: 120
      script: |
        set -eux

        echo "::group::Setup ExecuTorch"
        PYTHON_EXECUTABLE=python ${CONDA_RUN} EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_EXPERIMENTAL_MPS=1 ./install_executorch.sh
        echo "::endgroup::"

        # Isolate Inductor cache per job to prevent PCH conflicts
        export TMPDIR=$(mktemp -d "${RUNNER_TEMP}/tmpdir_XXXXXX")
        export TORCHINDUCTOR_CACHE_DIR=$(mktemp -d "${RUNNER_TEMP}/inductor_cache_XXXXXX")

        echo "::group::Export Qwen 3.5 MoE (tiny model, Metal)"
        ${CONDA_RUN} python -m executorch.examples.models.qwen3_5_moe.export \
          --tiny-test \
          --backend metal \
          --qlinear fpa4w \
          --output-dir /tmp/qwen35_moe_metal_tiny
        echo "::endgroup::"

        echo "::group::Build Metal runtime and Qwen 3.5 MoE runner"
        ${CONDA_RUN} cmake --workflow --preset llm-release-metal
        cd examples/models/qwen3_5_moe
        ${CONDA_RUN} cmake --workflow --preset qwen3-5-moe-metal
        cd -
        echo "::endgroup::"

        # Create a byte-level tokenizer for the tiny model (vocab_size=256).
        # Maps each byte value to its own token ID so any prompt produces valid IDs.
        ${CONDA_RUN} python - <<'PY'
        import json
        vocab = {chr(i) if 32 <= i < 127 else f'<0x{i:02X}>': i for i in range(256)}
        tokenizer = {
            'version': '1.0',
            'model': {'type': 'BPE', 'vocab': vocab, 'merges': []},
            'added_tokens': [
                {'id': i, 'content': chr(i) if 32 <= i < 127 else f'<0x{i:02X}>',
                 'single_word': False, 'lstrip': False, 'rstrip': False,
                 'normalized': False, 'special': False}
                for i in range(256)
            ],
        }
        with open('/tmp/qwen35_moe_metal_tiny/tokenizer.json', 'w') as f:
            json.dump(tokenizer, f)
        print('Created byte-level tokenizer.json')
        PY

        RUNNER=./cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner

        # Patch absolute libomp install name to rpath-based lookup (same as test_model_e2e.sh)
        if otool -L "$RUNNER" | grep -q "/opt/llvm-openmp/lib/libomp.dylib"; then
          install_name_tool -change /opt/llvm-openmp/lib/libomp.dylib @rpath/libomp.dylib "$RUNNER"
        fi

        MODEL=/tmp/qwen35_moe_metal_tiny/model.pte
        TOKENIZER=/tmp/qwen35_moe_metal_tiny/tokenizer.json

        echo "::group::Run Qwen 3.5 MoE inference (T=1 decode)"
        # Single-char prompt → 1 token → exercises decode-only path
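        # Drop errexit around the runner call so a non-zero exit code can be
        # captured and reported instead of aborting the script immediately.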
        set +e
        OUTPUT=$($RUNNER --model_path $MODEL --tokenizer_path $TOKENIZER \
          --prompt "A" --temperature 0 --max_new_tokens 4 2>&1)
        RC=$?
        set -e
        echo "$OUTPUT"
        if [ $RC -ne 0 ]; then
          echo "Failed: runner exited with code $RC"
          exit 1
        fi
        echo "$OUTPUT" | grep -q "Prompt tokens: 1" || { echo "Failed: expected 1 prompt token for decode path"; exit 1; }
        echo "$OUTPUT" | grep -q "Decode:" || { echo "Failed: decode did not complete"; exit 1; }
        echo "Success: decode completed"
        echo "::endgroup::"

        echo "::group::Run Qwen 3.5 MoE inference (T>2 prefill + decode)"
        set +e
        OUTPUT=$($RUNNER --model_path $MODEL --tokenizer_path $TOKENIZER \
          --prompt "one two three" --temperature 0 --max_new_tokens 4 2>&1)
        RC=$?
        set -e
        echo "$OUTPUT"
        if [ $RC -ne 0 ]; then
          echo "Failed: runner exited with code $RC"
          exit 1
        fi
        # Byte-level tokenizer: "one two three" = 13 tokens (13 bytes)
        PROMPT_TOKENS=$(echo "$OUTPUT" | grep -o "Prompt tokens: [0-9]*" | head -1 | grep -o "[0-9]*")
        if [ "$PROMPT_TOKENS" -le 2 ]; then
          echo "Failed: expected >2 prompt tokens for prefill path, got $PROMPT_TOKENS"
          exit 1
        fi
        echo "$OUTPUT" | grep -q "Decode:" || { echo "Failed: prefill + decode did not complete"; exit 1; }
        echo "Success: prefill ($PROMPT_TOKENS tokens) + decode completed"
        echo "::endgroup::"
  export-model-metal-artifact:
    name: export-model-metal-artifact
    # Skip this job if the pull request is from a fork (HuggingFace secrets are not available)
    if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request'
    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
    secrets: inherit
    strategy:
      fail-fast: false
      matrix:
        model:
          - repo: "mistralai"
            name: "Voxtral-Mini-3B-2507"
          - repo: "mistralai"
            name: "Voxtral-Mini-4B-Realtime-2602"
          - repo: "openai"
            name: "whisper-small"
          - repo: "openai"
            name: "whisper-large-v3-turbo"
          - repo: "nvidia"
            name: "parakeet-tdt"
        quant:
          - "non-quantized"
          - "quantized-int4-metal"
        exclude:
          # Exclude non-quantized for Voxtral Realtime (too large)
          - model:
              repo: "mistralai"
              name: "Voxtral-Mini-4B-Realtime-2602"
            quant: "non-quantized"
    with:
      default-packages: ""
      runner: macos-m2-stable
      python-version: '3.11'
      submodules: 'recursive'
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      timeout: 90
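      # Exposed to the script as SECRET_EXECUTORCH_HF_TOKEN (used by the
      # huggingface-cli login step below).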
      secrets-env: EXECUTORCH_HF_TOKEN
      upload-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-metal-${{ matrix.quant }}
      script: |
        set -eux

        echo "::group::Setup Huggingface"
        ${CONDA_RUN} pip install -U "huggingface_hub[cli]<1.0" accelerate
        ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
        echo "::endgroup::"

        echo "::group::Setup Optimum-ExecuTorch"
        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
        echo "Using optimum-executorch version: ${OPTIMUM_ET_VERSION}"
        ${CONDA_RUN} pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
        echo "::endgroup::"

        echo "::group::Setup ExecuTorch"
        PYTHON_EXECUTABLE=python ${CONDA_RUN} EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_EXPERIMENTAL_MPS=1 ./install_executorch.sh
        echo "::endgroup::"

        echo "::group::Pip List"
        ${CONDA_RUN} pip list
        echo "::endgroup::"

        # Isolate Inductor cache and precompiled headers (PCH) per job to prevent
        # PCH mtime conflicts between parallel matrix jobs on the same runner.
        # TORCHINDUCTOR_CACHE_DIR isolates the code cache; setting TMPDIR isolates
        # the PCH dir, which PyTorch derives from tempfile.gettempdir() independently.
        export TMPDIR=$(mktemp -d "${RUNNER_TEMP}/tmpdir_XXXXXX")
        export TORCHINDUCTOR_CACHE_DIR=$(mktemp -d "${RUNNER_TEMP}/inductor_cache_XXXXXX")

        ${CONDA_RUN} bash .ci/scripts/export_model_artifact.sh metal "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"

  test-model-metal-e2e:
    name: test-model-metal-e2e
    needs: export-model-metal-artifact
    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
    strategy:
      fail-fast: false
      matrix:
        model:
          - repo: "mistralai"
            name: "Voxtral-Mini-3B-2507"
          - repo: "mistralai"
            name: "Voxtral-Mini-4B-Realtime-2602"
          - repo: "openai"
            name: "whisper-small"
          - repo: "openai"
            name: "whisper-large-v3-turbo"
          - repo: "nvidia"
            name: "parakeet-tdt"
        quant:
          - "non-quantized"
          - "quantized-int4-metal"
        exclude:
          # Exclude non-quantized for Voxtral Realtime (too large)
          - model:
              repo: "mistralai"
              name: "Voxtral-Mini-4B-Realtime-2602"
            quant: "non-quantized"
    with:
      default-packages: ""
      runner: macos-m2-stable
      python-version: '3.11'
      submodules: 'recursive'
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      timeout: 90
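      # Fetches the artifact uploaded by export-model-metal-artifact (the name
      # must match its upload-artifact value); the script below reads it from
      # ${RUNNER_ARTIFACT_DIR}.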
      download-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-metal-${{ matrix.quant }}
      script: |
        set -eux

        echo "::group::Print machine info"
        uname -a
        if [ "$(uname -s)" == Darwin ]; then
          sw_vers
          # Print RAM in GB
          RAM_BYTES=$(sysctl -n hw.memsize)
          RAM_GB=$(echo "scale=2; $RAM_BYTES/1024/1024/1024" | bc)
          echo "Available RAM (GB): $RAM_GB"
          sysctl machdep.cpu.brand_string
          sysctl machdep.cpu.core_count
          # Print number of GPU cores (Apple Silicon)
          if command -v system_profiler &> /dev/null; then
            GPU_CORES=$(system_profiler SPDisplaysDataType | awk '/Total Number of Cores/ {print $5; exit}')
            if [ -z "$GPU_CORES" ]; then
              # Fallback: try to parse "Core Count" from Apple GPU section
              GPU_CORES=$(system_profiler SPDisplaysDataType | awk '/Core Count/ {print $3; exit}')
            fi
            echo "GPU Cores: ${GPU_CORES:-Unknown}"
          else
            echo "system_profiler not available, cannot determine GPU cores."
          fi
        fi
        echo "::endgroup::"

        ${CONDA_RUN} bash .ci/scripts/test_model_e2e.sh metal "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"