Fix missing check (#19340) #11461

Workflow file for this run

name: Test Metal Backend

on:
  push:
    branches:
      - main
      - release/*
    tags:
      - ciflow/metal/*
  pull_request:
    paths:
      - .github/workflows/metal.yml
      - backends/apple/metal/**
      - backends/aoti/**
      - examples/models/qwen3_5_moe/**
      - extension/llm/export/**
  workflow_dispatch:
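
# One concurrency group per PR (or per commit SHA on push); the trailing
# event-name booleans keep workflow_dispatch (and any future schedule) runs
# in their own groups so they never cancel an in-flight PR run.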
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

jobs:
  test-metal-builds:
    name: test-executorch-metal-build
    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
    with:
      default-packages: ""
      runner: macos-m2-stable
      python-version: '3.11'
      submodules: 'recursive'
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      timeout: 90
      script: |
        set -eux
        echo "::group::Test ExecuTorch Metal build"
        PYTHON_EXECUTABLE=python CMAKE_ARGS="-DEXECUTORCH_BUILD_METAL=ON" ${CONDA_RUN} --no-capture-output ./install_executorch.sh
        echo "::endgroup::"

  test-metal-modules:
    name: test-metal-backend-modules
    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
    with:
      default-packages: ""
      runner: macos-m2-stable
      python-version: '3.11'
      submodules: 'recursive'
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      timeout: 120
      script: |
        set -eux
        echo "::group::Setup ExecuTorch"
        PYTHON_EXECUTABLE=python ${CONDA_RUN} EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_EXPERIMENTAL_MPS=1 ./install_executorch.sh
        echo "::endgroup::"
        echo "::group::Build Metal Runtime"
        ${CONDA_RUN} backends/apple/metal/tests/run_metal_test.sh --build
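        # --build compiles the native Metal test runtime up front; the Python
        # unittest below then drives the compiled modules through it.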
echo "::endgroup::"
echo "::group::Run Metal Backend Module Tests"
${CONDA_RUN} python -m unittest backends.apple.metal.tests.test_modules.TestMetalBackendModules
echo "::endgroup::"

  test-metal-qwen35-moe-tiny:
    name: test-metal-qwen35-moe-tiny
    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
    with:
      default-packages: ""
      runner: macos-m2-stable
      python-version: '3.11'
      submodules: 'recursive'
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      timeout: 120
      script: |
        set -eux
        echo "::group::Setup ExecuTorch"
        PYTHON_EXECUTABLE=python ${CONDA_RUN} EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_EXPERIMENTAL_MPS=1 ./install_executorch.sh
        echo "::endgroup::"
        # Isolate Inductor cache per job to prevent PCH conflicts
        export TMPDIR=$(mktemp -d "${RUNNER_TEMP}/tmpdir_XXXXXX")
        export TORCHINDUCTOR_CACHE_DIR=$(mktemp -d "${RUNNER_TEMP}/inductor_cache_XXXXXX")
        echo "::group::Export Qwen 3.5 MoE (tiny model, Metal)"
        ${CONDA_RUN} python -m executorch.examples.models.qwen3_5_moe.export \
          --tiny-test \
          --backend metal \
          --qlinear fpa4w \
          --output-dir /tmp/qwen35_moe_metal_tiny
        echo "::endgroup::"
        echo "::group::Build Metal runtime and Qwen 3.5 MoE runner"
        ${CONDA_RUN} cmake --workflow --preset llm-release-metal
        cd examples/models/qwen3_5_moe
        ${CONDA_RUN} cmake --workflow --preset qwen3-5-moe-metal
        cd -
        echo "::endgroup::"
        # Create a byte-level tokenizer for the tiny model (vocab_size=256).
        # Maps each byte value to its own token ID so any prompt produces valid IDs.
        ${CONDA_RUN} python - <<'PY'
        import json

        # Printable ASCII maps to its own character; every other byte value
        # gets a <0xNN> placeholder token, so all 256 byte values are covered.
        vocab = {chr(i) if 32 <= i < 127 else f'<0x{i:02X}>': i for i in range(256)}
        tokenizer = {
            'version': '1.0',
            'model': {'type': 'BPE', 'vocab': vocab, 'merges': []},
            'added_tokens': [
                {'id': i, 'content': chr(i) if 32 <= i < 127 else f'<0x{i:02X}>',
                 'single_word': False, 'lstrip': False, 'rstrip': False,
                 'normalized': False, 'special': False}
                for i in range(256)
            ],
        }
        with open('/tmp/qwen35_moe_metal_tiny/tokenizer.json', 'w') as f:
            json.dump(tokenizer, f)
        print('Created byte-level tokenizer.json')
        PY
        RUNNER=./cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner
        # Patch absolute libomp install name to rpath-based lookup (same as test_model_e2e.sh)
        if otool -L "$RUNNER" | grep -q "/opt/llvm-openmp/lib/libomp.dylib"; then
          install_name_tool -change /opt/llvm-openmp/lib/libomp.dylib @rpath/libomp.dylib "$RUNNER"
        fi
        MODEL=/tmp/qwen35_moe_metal_tiny/model.pte
        TOKENIZER=/tmp/qwen35_moe_metal_tiny/tokenizer.json
        echo "::group::Run Qwen 3.5 MoE inference (T=1 decode)"
        # Single-char prompt → 1 token → exercises decode-only path
        set +e
        OUTPUT=$("$RUNNER" --model_path "$MODEL" --tokenizer_path "$TOKENIZER" \
          --prompt "A" --temperature 0 --max_new_tokens 4 2>&1)
        RC=$?
        set -e
        echo "$OUTPUT"
        if [ $RC -ne 0 ]; then
          echo "Failed: runner exited with code $RC"
          exit 1
        fi
        echo "$OUTPUT" | grep -q "Prompt tokens: 1" || { echo "Failed: expected 1 prompt token for decode path"; exit 1; }
        echo "$OUTPUT" | grep -q "Decode:" || { echo "Failed: decode did not complete"; exit 1; }
        echo "Success: decode completed"
        echo "::endgroup::"
        echo "::group::Run Qwen 3.5 MoE inference (T>2 prefill + decode)"
        set +e
        OUTPUT=$("$RUNNER" --model_path "$MODEL" --tokenizer_path "$TOKENIZER" \
          --prompt "one two three" --temperature 0 --max_new_tokens 4 2>&1)
        RC=$?
        set -e
        echo "$OUTPUT"
        if [ $RC -ne 0 ]; then
          echo "Failed: runner exited with code $RC"
          exit 1
        fi
        # Byte-level tokenizer: "one two three" = 13 bytes = 13 tokens
        PROMPT_TOKENS=$(echo "$OUTPUT" | grep -o "Prompt tokens: [0-9]*" | head -1 | grep -o "[0-9]*")
        if [ "$PROMPT_TOKENS" -le 2 ]; then
          echo "Failed: expected >2 prompt tokens for prefill path, got $PROMPT_TOKENS"
          exit 1
        fi
        echo "$OUTPUT" | grep -q "Decode:" || { echo "Failed: prefill + decode did not complete"; exit 1; }
        echo "Success: prefill ($PROMPT_TOKENS tokens) + decode completed"
        echo "::endgroup::"

  export-model-metal-artifact:
    name: export-model-metal-artifact
    # Skip this job if the pull request is from a fork (HuggingFace secrets are not available)
    if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request'
    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
    secrets: inherit
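    # 5 models x 2 quantization modes, minus the one excluded combination,
    # yields 9 matrix jobs.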
    strategy:
      fail-fast: false
      matrix:
        model:
          - repo: "mistralai"
            name: "Voxtral-Mini-3B-2507"
          - repo: "mistralai"
            name: "Voxtral-Mini-4B-Realtime-2602"
          - repo: "openai"
            name: "whisper-small"
          - repo: "openai"
            name: "whisper-large-v3-turbo"
          - repo: "nvidia"
            name: "parakeet-tdt"
        quant:
          - "non-quantized"
          - "quantized-int4-metal"
        exclude:
          # Exclude non-quantized for Voxtral Realtime (too large)
          - model:
              repo: "mistralai"
              name: "Voxtral-Mini-4B-Realtime-2602"
            quant: "non-quantized"
    with:
      default-packages: ""
      runner: macos-m2-stable
      python-version: '3.11'
      submodules: 'recursive'
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      timeout: 90
      secrets-env: EXECUTORCH_HF_TOKEN
      upload-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-metal-${{ matrix.quant }}
      script: |
        set -eux
        echo "::group::Setup Huggingface"
        ${CONDA_RUN} pip install -U "huggingface_hub[cli]<1.0" accelerate
        ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
        echo "::endgroup::"
        echo "::group::Setup Optimum-ExecuTorch"
        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
        echo "Using optimum-executorch version: ${OPTIMUM_ET_VERSION}"
        ${CONDA_RUN} pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
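        # The commit pin keeps optimum-executorch in lockstep with the
        # ExecuTorch commit under test instead of floating to its latest main.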
echo "::endgroup::"
echo "::group::Setup ExecuTorch"
PYTHON_EXECUTABLE=python ${CONDA_RUN} EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_EXPERIMENTAL_MPS=1 ./install_executorch.sh
echo "::endgroup::"
echo "::group::Pip List"
${CONDA_RUN} pip list
echo "::endgroup::"
# Isolate Inductor cache and precompiled headers (PCH) per job to prevent
# PCH mtime conflicts between parallel matrix jobs on the same runner.
# TORCHINDUCTOR_CACHE_DIR isolates the code cache; setting TMPDIR isolates
# the PCH dir, which PyTorch derives from tempfile.gettempdir() independently.
export TMPDIR=$(mktemp -d "${RUNNER_TEMP}/tmpdir_XXXXXX")
export TORCHINDUCTOR_CACHE_DIR=$(mktemp -d "${RUNNER_TEMP}/inductor_cache_XXXXXX")
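        # export_model_artifact.sh writes the exported model files into
        # ${RUNNER_ARTIFACT_DIR}, which the macos_job wrapper uploads under the
        # upload-artifact name above.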
        ${CONDA_RUN} bash .ci/scripts/export_model_artifact.sh metal "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"

  test-model-metal-e2e:
    name: test-model-metal-e2e
    needs: export-model-metal-artifact
    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
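    # Mirrors the export matrix above; each job downloads the artifact uploaded
    # by the matching export-model-metal-artifact job.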
    strategy:
      fail-fast: false
      matrix:
        model:
          - repo: "mistralai"
            name: "Voxtral-Mini-3B-2507"
          - repo: "mistralai"
            name: "Voxtral-Mini-4B-Realtime-2602"
          - repo: "openai"
            name: "whisper-small"
          - repo: "openai"
            name: "whisper-large-v3-turbo"
          - repo: "nvidia"
            name: "parakeet-tdt"
        quant:
          - "non-quantized"
          - "quantized-int4-metal"
        exclude:
          # Exclude non-quantized for Voxtral Realtime (too large)
          - model:
              repo: "mistralai"
              name: "Voxtral-Mini-4B-Realtime-2602"
            quant: "non-quantized"
    with:
      default-packages: ""
      runner: macos-m2-stable
      python-version: '3.11'
      submodules: 'recursive'
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      timeout: 90
      download-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-metal-${{ matrix.quant }}
      script: |
        set -eux
        echo "::group::Print machine info"
        uname -a
        if [ "$(uname -s)" == "Darwin" ]; then
          sw_vers
          # Print RAM in GB
          RAM_BYTES=$(sysctl -n hw.memsize)
          RAM_GB=$(echo "scale=2; $RAM_BYTES/1024/1024/1024" | bc)
          echo "Available RAM (GB): $RAM_GB"
          sysctl machdep.cpu.brand_string
          sysctl machdep.cpu.core_count
          # Print number of GPU cores (Apple Silicon)
          if command -v system_profiler &> /dev/null; then
            GPU_CORES=$(system_profiler SPDisplaysDataType | awk '/Total Number of Cores/ {print $5; exit}')
            if [ -z "$GPU_CORES" ]; then
              # Fallback: try to parse "Core Count" from the Apple GPU section
              GPU_CORES=$(system_profiler SPDisplaysDataType | awk '/Core Count/ {print $3; exit}')
            fi
            echo "GPU Cores: ${GPU_CORES:-Unknown}"
          else
            echo "system_profiler not available, cannot determine GPU cores."
          fi
        fi
        echo "::endgroup::"
        ${CONDA_RUN} bash .ci/scripts/test_model_e2e.sh metal "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"