Commit fff15c6: Benchmark optimum-executorch

Authored and committed by Guang Yang
1 parent 4457cf6

File tree: 6 files changed, +82 −25 lines

.ci/scripts/gather_benchmark_configs.py (7 additions, 5 deletions)

@@ -32,7 +32,8 @@
 BENCHMARK_CONFIGS = {
     "xplat": [
         "xnnpack_q8",
-        "hf_xnnpack_fp32",
+        "hf_xnnpack_custom_spda_kv_cache_8da4w",
+        "et_xnnpack_custom_spda_kv_cache_8da4w",
         "llama3_fb16",
         "llama3_spinquant",
         "llama3_qlora",

@@ -129,25 +130,26 @@ def generate_compatible_configs(model_name: str, target_os=None) -> List[str]:
     """
     configs = []
     if is_valid_huggingface_model_id(model_name):
+        configs.append("hf_xnnpack_custom_spda_kv_cache_8da4w")
         if model_name.startswith("meta-llama/"):
-            # LLaMA models
+            # etLLM recipes for Llama
             repo_name = model_name.split("meta-llama/")[1]
             if "qlora" in repo_name.lower():
                 configs.append("llama3_qlora")
             elif "spinquant" in repo_name.lower():
                 configs.append("llama3_spinquant")
             else:
                 configs.append("llama3_fb16")
+                configs.append("et_xnnpack_custom_spda_kv_cache_8da4w")
             configs.extend(
                 [
                     config
                     for config in BENCHMARK_CONFIGS.get(target_os, [])
                     if config.startswith("llama")
                 ]
             )
-        else:
-            # Non-LLaMA models
-            configs.append("hf_xnnpack_fp32")
+        if model_name.startswith("Qwen/Qwen3"):
+            configs.append("et_xnnpack_custom_spda_kv_cache_8da4w")
     elif model_name in MODEL_NAME_TO_MODEL:
         # ExecuTorch in-tree non-GenAI models
         configs.append("xnnpack_q8")
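For reference, the patched selection logic reduces to the sketch below. This is a simplified standalone rendering of the diff above, not the actual script: is_valid_huggingface_model_id() is stubbed with a naive check, and the Llama branches are collapsed into one condition (in the real code, Llama repos receive the et_xnnpack recipe only on the default llama3_fb16 path, while SpinQuant and QLoRA repos get their dedicated llama3_* configs).

# Simplified sketch of the patched generate_compatible_configs().
# "hf_*" configs exercise the optimum-executorch export path;
# "et_*" configs exercise the etLLM export path.
def compatible_configs_sketch(model_name: str) -> list[str]:
    configs = []
    if "/" in model_name:  # naive stand-in for is_valid_huggingface_model_id()
        # Every valid Hugging Face model now gets the optimum-executorch recipe.
        configs.append("hf_xnnpack_custom_spda_kv_cache_8da4w")
        # Llama (default path) and Qwen3 models also get the etLLM recipe.
        if model_name.startswith(("meta-llama/", "Qwen/Qwen3")):
            configs.append("et_xnnpack_custom_spda_kv_cache_8da4w")
    return configs

# compatible_configs_sketch("Qwen/Qwen3-0.6B")
# -> ["hf_xnnpack_custom_spda_kv_cache_8da4w", "et_xnnpack_custom_spda_kv_cache_8da4w"]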

.github/workflows/android-perf-private-device-experiment.yml (3 additions, 3 deletions)

@@ -18,7 +18,7 @@ on:
         description: Models to be benchmarked
         required: false
         type: string
-        default: mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8
+        default: mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,google/gemma-3-1b-it,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf
       devices:
         description: Target devices to run benchmark
         required: false
@@ -34,7 +34,7 @@ on:
         description: Models to be benchmarked
         required: false
         type: string
-        default: mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8
+        default: mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,google/gemma-3-1b-it,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf
       devices:
         description: Target devices to run benchmark
         required: false
@@ -57,6 +57,6 @@ jobs:
       id-token: write
       contents: read
     with:
-      models: ${{ inputs.models || 'mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' }}
+      models: ${{ inputs.models || 'mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,google/gemma-3-1b-it,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf' }}
       devices: samsung_galaxy_s22_private
       benchmark_configs: ${{ inputs.benchmark_configs }}

.github/workflows/android-perf.yml (32 additions, 6 deletions)

@@ -70,7 +70,7 @@ jobs:
           # Separate default values from the workflow dispatch. To ensure defaults are accessible
           # during scheduled runs and to provide flexibility for different defaults between
           # on-demand and periodic benchmarking.
-          CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'llama,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' || 'llama' }}
+          CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'llama,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,google/gemma-3-1b-it,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,allenai/OLMo-1B-hf' || 'llama' }}
           CRON_DEFAULT_DEVICES: samsung_galaxy_s22
         run: |
           set -eux
@@ -201,8 +201,8 @@
           HF_MODEL_REPO=${{ matrix.model }}
           OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.config }}"

+          # Convert HF checkpoint to ET via etLLM path
           if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then
-            # Llama models on Hugging Face
             if [[ ${{ matrix.config }} == "llama3_spinquant" ]]; then
               # SpinQuant
               # Download prequantized checkpoint from Hugging Face
@@ -298,12 +298,38 @@
               python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME"
               ls -lh "${OUT_ET_MODEL_NAME}.pte"
             fi
-          else
-            echo "Unsupported model ${{ matrix.model }}"
-            exit 1
           fi

-          zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}/tokenizer.model"
+          if [[ ${{ matrix.config }} == "hf_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
+            # Install optimum-executorch
+            git clone https://github.com/huggingface/optimum-executorch
+            pushd optimum-executorch
+            # There is no release yet; for CI stability, always test the same commit on main
+            git checkout 1c653dc49812fc431a22312c7295d97005d22e12
+            python install_dev.py
+            pip list
+
+            optimum-cli export executorch \
+              --model $HF_MODEL_REPO \
+              --task "text-generation" \
+              --recipe xnnpack \
+              --use_custom_sdpa \
+              --use_custom_kv_cache \
+              --qlinear \
+              --qembedding \
+              --output_dir "."
+
+            mv model.pte ${OUT_ET_MODEL_NAME}.pte
+            ls -lh "${OUT_ET_MODEL_NAME}.pte"
+
+            DOWNLOADED_PATH=$(
+              bash .ci/scripts/download_hf_hub.sh \
+                --model_id "${HF_MODEL_REPO}" \
+                --files "tokenizer.json"
+            )
+          fi
+
+          zip -j model.zip ${OUT_ET_MODEL_NAME}.pte ${DOWNLOADED_PATH}/tokenizer.*
           ls -lh model.zip
           mkdir -p "${ARTIFACTS_DIR_NAME}"
           mv model.zip "${ARTIFACTS_DIR_NAME}"
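Before it is zipped, the exported .pte can be sanity-checked off-device with the optimum-executorch Python API. A minimal sketch, assuming ExecuTorchModelForCausalLM and its text_generation helper behave as documented in the optimum-executorch README at the pinned commit; the model id is one of the newly added benchmark defaults:

# Hedged sketch: export a model with the same xnnpack recipe the workflow
# uses and run a short generation to confirm the artifact is functional.
from optimum.executorch import ExecuTorchModelForCausalLM
from transformers import AutoTokenizer

model_id = "HuggingFaceTB/SmolLM2-135M"  # example from the new benchmark defaults

# from_pretrained() exports on the fly; it can also be pointed at a local
# directory that already holds a model.pte from `optimum-cli export executorch`.
model = ExecuTorchModelForCausalLM.from_pretrained(model_id, recipe="xnnpack")
tokenizer = AutoTokenizer.from_pretrained(model_id)

print(model.text_generation(tokenizer=tokenizer, prompt="Hello,", max_seq_len=32))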

.github/workflows/apple-perf-private-device-experiment.yml (3 additions, 3 deletions)

@@ -18,7 +18,7 @@ on:
         description: Models to be benchmarked
         required: false
         type: string
-        default: mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8
+        default: mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,google/gemma-3-1b-it,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf
       devices:
         description: Target devices to run benchmark
         required: false
@@ -34,7 +34,7 @@ on:
         description: Models to be benchmarked
         required: false
         type: string
-        default: mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8
+        default: mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,google/gemma-3-1b-it,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf
       devices:
         description: Target devices to run benchmark
         required: false
@@ -57,6 +57,6 @@ jobs:
       id-token: write
       contents: read
     with:
-      models: ${{ inputs.models || 'mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' }}
+      models: ${{ inputs.models || 'mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,google/gemma-3-1b-it,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf' }}
       devices: apple_iphone_15_private
       benchmark_configs: ${{ inputs.benchmark_configs }}

.github/workflows/apple-perf.yml (32 additions, 5 deletions)

@@ -70,7 +70,7 @@ jobs:
           # Separate default values from the workflow dispatch. To ensure defaults are accessible
           # during scheduled runs and to provide flexibility for different defaults between
           # on-demand and periodic benchmarking.
-          CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'llama,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' || 'llama' }}
+          CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'llama,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,google/gemma-3-1b-it,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf' || 'llama' }}
           CRON_DEFAULT_DEVICES: apple_iphone_15
         run: |
           set -eux
@@ -207,6 +207,7 @@
           HF_MODEL_REPO=${{ matrix.model }}
           OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.config }}"

+          # Convert HF checkpoint to ET via etLLM path
           if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then
             # Llama models on Hugging Face
             if [[ ${{ matrix.config }} == "llama3_spinquant" ]]; then
@@ -299,12 +300,38 @@
               ${CONDA_RUN} python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME"
               ls -lh "${OUT_ET_MODEL_NAME}.pte"
             fi
-          else
-            echo "Unsupported model ${{ matrix.model }}"
-            exit 1
           fi

-          zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}/tokenizer.model"
+          if [[ ${{ matrix.config }} == "hf_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
+            # Install optimum-executorch
+            git clone https://github.com/huggingface/optimum-executorch
+            pushd optimum-executorch
+            # There is no release yet; for CI stability, always test the same commit on main
+            git checkout 1c653dc49812fc431a22312c7295d97005d22e12
+            python install_dev.py
+            pip list
+
+            optimum-cli export executorch \
+              --model $HF_MODEL_REPO \
+              --task "text-generation" \
+              --recipe xnnpack \
+              --use_custom_sdpa \
+              --use_custom_kv_cache \
+              --qlinear \
+              --qembedding \
+              --output_dir "."
+
+            mv model.pte ${OUT_ET_MODEL_NAME}.pte
+            ls -lh "${OUT_ET_MODEL_NAME}.pte"
+
+            DOWNLOADED_PATH=$(
+              bash .ci/scripts/download_hf_hub.sh \
+                --model_id "${HF_MODEL_REPO}" \
+                --files "tokenizer.json"
+            )
+          fi
+
+          zip -j model.zip ${OUT_ET_MODEL_NAME}.pte ${DOWNLOADED_PATH}/tokenizer.*
           ls -lh model.zip
           mkdir -p "${ARTIFACTS_DIR_NAME}"
           mv model.zip "${ARTIFACTS_DIR_NAME}"

.github/workflows/trunk.yml (5 additions, 3 deletions)

@@ -570,7 +570,7 @@ jobs:
           git clone https://github.com/huggingface/optimum-executorch
           pushd optimum-executorch
           # There is no release yet; for CI stability, always test the same commit on main
-          git checkout da80c9e35b3db5c7eea8731b7d660482fb4870a8
+          git checkout 1c653dc49812fc431a22312c7295d97005d22e12
           pip install .[tests]
           popd

@@ -588,16 +588,18 @@
           echo "::group::Export to ExecuTorch"
           # Pass matrix variable as environment variable
           export MODEL_ID="${{ matrix.hf_model_id }}"
-          export OUTPUT_DIR="$(pwd)/${MODEL_ID}_custom_sdpa_8da4w"
+          export OUTPUT_DIR="$(pwd)/${MODEL_ID}_custom_sdpa_kv_cache_8da4w"
           pushd optimum-executorch

           optimum-cli export executorch \
               --model ${MODEL_ID} \
               --task text-generation \
               --recipe xnnpack \
               --use_custom_sdpa \
+              --use_custom_kv_cache \
+              --qlinear \
+              --qembedding \
               --output_dir ${OUTPUT_DIR} \
-              --qlinear

           ls -FlAGhp ${OUTPUT_DIR}
           popd
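For context on the new flags: --use_custom_sdpa and --use_custom_kv_cache swap ExecuTorch's fused SDPA and KV-cache custom ops into the exported graph, while --qlinear and --qembedding quantize linear and embedding weights; the 8da4w in the new config names denotes 8-bit dynamic activations with 4-bit weights (a reading inferred from the flag names and ExecuTorch's quantization naming, as the commit itself does not spell this out).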
