Commit efee6e4

Author: morelos (committed)

Update base for Update on "[ET] enabling half dtype input for quantization"

Improving the CPU implementation of op_quantize to support half dtype input and adding additional testing.

Differential Revision: [D76053764](https://our.internmc.facebook.com/intern/diff/D76053764/)

[ghstack-poisoned]
2 parents 9bc8e6f + 3d3cf2a commit efee6e4
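
As context for the change this stack tracks: the quantize op maps a floating-point tensor to an integer representation, and accepting half dtype means fp16 inputs can feed that path directly. Below is a rough, hedged sketch in PyTorch of per-tensor affine quantization of an fp16 input. It is illustrative only, not the ExecuTorch op_quantize kernel, and the helper name is hypothetical.

import torch

def quantize_per_tensor(x: torch.Tensor, scale: float, zero_point: int,
                        qmin: int = -128, qmax: int = 127) -> torch.Tensor:
    # Upcast the half (fp16) input to fp32 before scaling, mirroring the idea of
    # accepting Half inputs in the quantize path; the real kernel differs.
    x_fp32 = x.to(torch.float32)
    q = torch.round(x_fp32 / scale) + zero_point
    return torch.clamp(q, qmin, qmax).to(torch.int8)

# Example: a Half input now quantizes the same way an fp32 input would.
x_half = torch.randn(4, dtype=torch.float16)
print(quantize_per_tensor(x_half, scale=0.05, zero_point=0))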

File tree

111 files changed: +2912 additions, -6432 deletions


.ci/scripts/build-mediatek-sdk.sh

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -eux
+
+build_neuron_backend() {
+  echo "Start building neuron backend."
+  export ANDROID_NDK=/opt/ndk
+  export MEDIATEK_SDK_ROOT=/tmp/neuropilot
+  export NEURON_BUFFER_ALLOCATOR_LIB=${MEDIATEK_SDK_ROOT}/libneuron_buffer_allocator.so
+  export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)"
+
+
+  cd ${EXECUTORCH_ROOT}
+  ./backends/mediatek/scripts/mtk_build.sh
+}
+
+build_neuron_backend

.ci/scripts/gather_benchmark_configs.py

Lines changed: 9 additions & 8 deletions
@@ -32,7 +32,8 @@
 BENCHMARK_CONFIGS = {
     "xplat": [
         "xnnpack_q8",
-        "hf_xnnpack_fp32",
+        "hf_xnnpack_custom_spda_kv_cache_8da4w",
+        "et_xnnpack_custom_spda_kv_cache_8da4w",
         "llama3_fb16",
         "llama3_spinquant",
         "llama3_qlora",
@@ -129,25 +130,25 @@ def generate_compatible_configs(model_name: str, target_os=None) -> List[str]:
     """
     configs = []
     if is_valid_huggingface_model_id(model_name):
+        configs.append("hf_xnnpack_custom_spda_kv_cache_8da4w")
         if model_name.startswith("meta-llama/"):
-            # LLaMA models
+            # etLLM recipes for Llama
             repo_name = model_name.split("meta-llama/")[1]
             if "qlora" in repo_name.lower():
-                configs.append("llama3_qlora")
+                configs = ["llama3_qlora"]
             elif "spinquant" in repo_name.lower():
-                configs.append("llama3_spinquant")
+                configs = ["llama3_spinquant"]
             else:
-                configs.append("llama3_fb16")
+                configs.extend(["llama3_fb16", "et_xnnpack_custom_spda_kv_cache_8da4w"])
             configs.extend(
                 [
                     config
                     for config in BENCHMARK_CONFIGS.get(target_os, [])
                     if config.startswith("llama")
                 ]
             )
-        else:
-            # Non-LLaMA models
-            configs.append("hf_xnnpack_fp32")
+        if model_name.startswith("Qwen/Qwen3"):
+            configs.append("et_xnnpack_custom_spda_kv_cache_8da4w")
     elif model_name in MODEL_NAME_TO_MODEL:
         # ExecuTorch in-tree non-GenAI models
         configs.append("xnnpack_q8")

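To make the effect of the gather_benchmark_configs.py change above concrete, here is a hedged sketch of the configs one would now expect for a few model names. The import path is assumed for illustration (the CI test below imports the module indirectly), and the exact result also depends on BENCHMARK_CONFIGS for the target OS.

# Illustrative expectations based on the diff above; not an exhaustive test.
from gather_benchmark_configs import generate_compatible_configs  # assumed import path

android = "android"

# Any valid Hugging Face model id now picks up the optimum-executorch recipe.
assert "hf_xnnpack_custom_spda_kv_cache_8da4w" in generate_compatible_configs(
    "Qwen/Qwen3-0.6B", android
)

# Qwen3 additionally gets the etLLM XNNPACK recipe.
assert "et_xnnpack_custom_spda_kv_cache_8da4w" in generate_compatible_configs(
    "Qwen/Qwen3-0.6B", android
)

# In-tree non-GenAI models (e.g. mv3) keep the quantized XNNPACK config.
assert "xnnpack_q8" in generate_compatible_configs("mv3", android)
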
.ci/scripts/setup-mediatek-deps.sh

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -eux
+
+MEDIATEK_INSTALLATION_DIR=/tmp/neuropilot
+EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)"
+
+install_neuropilot() {
+  echo "Start installing neuropilot."
+  mkdir -p "${MEDIATEK_INSTALLATION_DIR}"
+
+  curl -Lo /tmp/neuropilot-express.tar.gz "https://s3.ap-southeast-1.amazonaws.com/mediatek.neuropilot.com/06302508-4c94-4bf2-9789-b0ee44e83e27.gz"
+  echo "Finishing downloading neuropilot sdk."
+  tar zxvf /tmp/neuropilot-express.tar.gz --strip-components=1 --directory "${MEDIATEK_INSTALLATION_DIR}"
+  echo "Finishing unzip neuropilot sdk."
+
+  # Copy NP header
+  cp ${MEDIATEK_INSTALLATION_DIR}/api/NeuronAdapter.h ${EXECUTORCH_ROOT}/backends/mediatek/runtime/include/api/
+
+  # Print the content for manual verification
+  ls -lah "${MEDIATEK_INSTALLATION_DIR}"
+}
+
+setup_neuropilot() {
+  pip3 install -r ${EXECUTORCH_ROOT}/backends/mediatek/requirements.txt
+  pip3 install ${MEDIATEK_INSTALLATION_DIR}/mtk_neuron-8.2.19-py3-none-linux_x86_64.whl
+  pip3 install ${MEDIATEK_INSTALLATION_DIR}/mtk_converter-8.13.0_public_packages/mtk_converter-8.13.0+public-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
+}
+
+setup_calibration_data() {
+  curl -Lo /tmp/imagenette2-160.tgz https://s3.amazonaws.com/fast-ai-imageclas/imagenette2-160.tgz
+  tar zxvf /tmp/imagenette2-160.tgz --strip-components=1 --directory "${MEDIATEK_INSTALLATION_DIR}"
+}
+
+install_neuropilot
+setup_neuropilot
+setup_calibration_data

.ci/scripts/test_llava.sh

Lines changed: 1 addition & 1 deletion
@@ -147,7 +147,7 @@ run_and_verify() {

   # verify result.txt
   RESULT=$(cat result.txt)
-  EXPECTED_PREFIX="ASSISTANT: image captures a basketball game in progress, with several players on the court. "
+  EXPECTED_PREFIX="ASSISTANT: image captures a basketball game in progress, with"

   if [[ "${RESULT}" == *"${EXPECTED_PREFIX}"* ]]; then
     echo "Expected result prefix: ${EXPECTED_PREFIX}"

.ci/scripts/test_model.sh

Lines changed: 24 additions & 0 deletions
@@ -244,6 +244,24 @@ test_model_with_mps() {
   EXPORTED_MODEL=$(find "." -type f -name "${MODEL_NAME}*.pte" -print -quit)
 }

+test_model_with_mediatek() {
+  if [[ "${MODEL_NAME}" == "dl3" ]]; then
+    EXPORT_SCRIPT=deeplab_v3
+  elif [[ "${MODEL_NAME}" == "mv3" ]]; then
+    EXPORT_SCRIPT=mobilenet_v3
+  elif [[ "${MODEL_NAME}" == "mv2" ]]; then
+    EXPORT_SCRIPT=mobilenet_v2
+  elif [[ "${MODEL_NAME}" == "ic4" ]]; then
+    EXPORT_SCRIPT=inception_v4
+  elif [[ "${MODEL_NAME}" == "ic3" ]]; then
+    EXPORT_SCRIPT=inception_v3
+  fi
+
+  PYTHONPATH=examples/mediatek/ "${PYTHON_EXECUTABLE}" -m examples.mediatek.model_export_scripts.${EXPORT_SCRIPT} -d /tmp/neuropilot/train -a ${EXPORT_SCRIPT}
+  EXPORTED_MODEL=$(find "./${EXPORT_SCRIPT}" -type f -name "*.pte" -print -quit)
+}
+
+
 if [[ "${BACKEND}" == "portable" ]]; then
   echo "Testing ${MODEL_NAME} with portable kernels..."
   test_model
@@ -281,6 +299,12 @@ elif [[ "${BACKEND}" == *"xnnpack"* ]]; then
   if [[ $? -eq 0 ]]; then
     prepare_artifacts_upload
   fi
+elif [[ "${BACKEND}" == "mediatek" ]]; then
+  echo "Testing ${MODEL_NAME} with mediatek..."
+  test_model_with_mediatek
+  if [[ $? -eq 0 ]]; then
+    prepare_artifacts_upload
+  fi
 else
   set +e
   if [[ "${BACKEND}" == *"quantization"* ]]; then

.ci/scripts/tests/test_gather_benchmark_configs.py

Lines changed: 13 additions & 4 deletions
@@ -112,15 +112,24 @@ def test_generate_compatible_configs_llama_model(self):
         result = self.gather_benchmark_configs.generate_compatible_configs(
             model_name, target_os
         )
-        expected = ["llama3_fb16", "llama3_coreml_ane"]
-        self.assertEqual(result, expected)
+        expected = [
+            "llama3_fb16",
+            "llama3_coreml_ane",
+            "et_xnnpack_custom_spda_kv_cache_8da4w",
+            "hf_xnnpack_custom_spda_kv_cache_8da4w",
+        ]
+        self.assertCountEqual(result, expected)

         target_os = "android"
         result = self.gather_benchmark_configs.generate_compatible_configs(
             model_name, target_os
         )
-        expected = ["llama3_fb16"]
-        self.assertEqual(result, expected)
+        expected = [
+            "llama3_fb16",
+            "et_xnnpack_custom_spda_kv_cache_8da4w",
+            "hf_xnnpack_custom_spda_kv_cache_8da4w",
+        ]
+        self.assertCountEqual(result, expected)

     def test_generate_compatible_configs_quantized_llama_model(self):
         model_name = "meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8"

.ci/scripts/unittest-buck2.sh

Lines changed: 3 additions & 1 deletion
@@ -15,8 +15,10 @@ buck2 query "//backends/apple/... + //backends/example/... + \
 //kernels/optimized/... + //kernels/portable/... + //kernels/quantized/... + \
 //kernels/test/... + //runtime/... + //schema/... + //test/... + //util/..."

+# TODO: optimized ops are unbuildable because they now use ATen; put
+# them back after we can use PyTorch in OSS buck.
 UNBUILDABLE_OPTIMIZED_OPS_REGEX="_elu|gelu|fft|log_softmax"
-BUILDABLE_OPTIMIZED_OPS=$(buck2 query //kernels/optimized/cpu/... | grep -E -v $UNBUILDABLE_OPTIMIZED_OPS_REGEX)
+BUILDABLE_OPTIMIZED_OPS= #$(buck2 query //kernels/optimized/cpu/... | grep -E -v $UNBUILDABLE_OPTIMIZED_OPS_REGEX)

 # TODO: build prim_ops_test_cpp again once supported_features works in
 # OSS buck.

.github/workflows/android-perf-private-device-experiment.yml

Lines changed: 3 additions & 3 deletions
@@ -18,7 +18,7 @@ on:
         description: Models to be benchmarked
         required: false
         type: string
-        default: mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8
+        default: Qwen/Qwen3-0.6B
       devices:
         description: Target devices to run benchmark
         required: false
@@ -34,7 +34,7 @@ on:
         description: Models to be benchmarked
         required: false
         type: string
-        default: mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8
+        default: Qwen/Qwen3-0.6B
       devices:
         description: Target devices to run benchmark
         required: false
@@ -57,6 +57,6 @@ jobs:
       id-token: write
       contents: read
     with:
-      models: ${{ inputs.models || 'mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' }}
+      models: ${{ inputs.models || github.event_name == 'schedule' && 'Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf' || 'Qwen/Qwen3-0.6B' }}
       devices: samsung_galaxy_s22_private
       benchmark_configs: ${{ inputs.benchmark_configs }}

.github/workflows/android-perf.yml

Lines changed: 86 additions & 15 deletions
@@ -6,12 +6,14 @@ on:
   pull_request:
     paths:
       - .github/workflows/android-perf.yml
+      - .ci/scripts/gather_benchmark_configs.py
       - extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml.j2
   push:
     branches:
       - main
     paths:
       - .github/workflows/android-perf.yml
+      - .ci/scripts/gather_benchmark_configs.py
       - extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml.j2
   # Note: GitHub has an upper limit of 10 inputs
   workflow_dispatch:
@@ -20,7 +22,7 @@ on:
         description: Models to be benchmarked
         required: false
         type: string
-        default: llama
+        default: Qwen/Qwen3-0.6B
       devices:
         description: Target devices to run benchmark
         required: false
@@ -36,7 +38,7 @@ on:
         description: Models to be benchmarked
         required: false
        type: string
-        default: llama
+        default: Qwen/Qwen3-0.6B
       devices:
         description: Target devices to run benchmark
         required: false
@@ -70,7 +72,7 @@ jobs:
          # Separate default values from the workflow dispatch. To ensure defaults are accessible
          # during scheduled runs and to provide flexibility for different defaults between
          # on-demand and periodic benchmarking.
-          CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'llama,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' || 'llama' }}
+          CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,allenai/OLMo-1B-hf' || 'Qwen/Qwen3-0.6B' }}
          CRON_DEFAULT_DEVICES: samsung_galaxy_s22
        run: |
          set -eux
@@ -201,8 +203,8 @@ jobs:
            HF_MODEL_REPO=${{ matrix.model }}
            OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.config }}"

+            # Convert HF checkpoint to ET via etLLM path
            if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then
-              # Llama models on Hugging Face
              if [[ ${{ matrix.config }} == "llama3_spinquant" ]]; then
                # SpinQuant
                # Download prequantized chceckpoint from Hugging Face
@@ -272,6 +274,21 @@ jobs:
                  --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
                  --output_name="${OUT_ET_MODEL_NAME}.pte"
                ls -lh "${OUT_ET_MODEL_NAME}.pte"
+              elif [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
+                DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
+                python -m examples.models.llama.export_llama \
+                  --model llama3_2 \
+                  --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+                  --params "${DOWNLOADED_PATH}/params.json" \
+                  -kv \
+                  --use_sdpa_with_kv_cache \
+                  -d fp32 \
+                  -X \
+                  --xnnpack-extended-ops \
+                  -qmode 8da4w -G 32 -E 8,0 \
+                  --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+                  --output_name="${OUT_ET_MODEL_NAME}.pte"
+                ls -lh "${OUT_ET_MODEL_NAME}.pte"
              elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then
                export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
                export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/
@@ -292,21 +309,75 @@ jobs:
                OUT_ET_MODEL_NAME="llama3_2_qnn" # Qualcomm hard-coded it in their script
                find . -name "${OUT_ET_MODEL_NAME}.pte" -not -path "./${OUT_ET_MODEL_NAME}.pte" -exec mv {} ./ \;
                ls -lh "${OUT_ET_MODEL_NAME}.pte"
-              else
-                # By default, test with the Hugging Face model and the xnnpack recipe
-                DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model")
-                python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME"
-                ls -lh "${OUT_ET_MODEL_NAME}.pte"
              fi
-            else
-              echo "Unsupported model ${{ matrix.model }}"
-              exit 1
+            elif [[ "$HF_MODEL_REPO" == "Qwen/Qwen3-0.6B" ]]; then
+              if [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
+                DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json")
+                python -m examples.models.llama.export_llama \
+                  --model qwen3-0_6b \
+                  --params examples/models/qwen3/0_6b_config.json \
+                  -kv \
+                  --use_sdpa_with_kv_cache \
+                  -d fp32 \
+                  -X \
+                  --xnnpack-extended-ops \
+                  -qmode 8da4w \
+                  -G 32 \
+                  -E 8,0 \
+                  --metadata '{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
+                  --output_name="${OUT_ET_MODEL_NAME}.pte"
+                ls -lh "${OUT_ET_MODEL_NAME}.pte"
+              fi
+            fi
+
+            if [[ ${{ matrix.config }} == "hf_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
+              DOWNLOADED_PATH=$(
+                bash .ci/scripts/download_hf_hub.sh \
+                  --model_id "${HF_MODEL_REPO}" \
+                  --files "tokenizer.json"
+              )
+              echo "tokenizer.json is downloaded to $DOWNLOADED_PATH"
+
+              # Install optimum-executorch
+              git clone https://github.com/huggingface/optimum-executorch
+              pushd optimum-executorch
+              # There is no release yet, for CI stability, always test from the same commit on main
+              git checkout 1c653dc49812fc431a22312c7295d97005d22e12
+              python install_dev.py
+              pip list
+
+              ARGS=(
+                "--model" "${HF_MODEL_REPO}"
+                "--task" "text-generation"
+                "--recipe" "xnnpack"
+                "--use_custom_sdpa"
+                "--qlinear"
+                "--qembedding"
+                "--output_dir" ".."
+              )
+
+              # Add conditional arguments based on model
+              case "${HF_MODEL_REPO}" in
+                *"google/gemma-3-1b-it"*)
+                  echo "--use_custom_kv_cache can not be used for HybridCache"
+                  ;;
+                *)
+                  ARGS+=("--use_custom_kv_cache")
+                  ;;
+              esac
+
+              optimum-cli export executorch "${ARGS[@]}"
+              popd
+
+              mv model.pte ${OUT_ET_MODEL_NAME}.pte
+              ls -lh "${OUT_ET_MODEL_NAME}.pte"
            fi

-            zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}/tokenizer.model"
+            zip -j model.zip ${OUT_ET_MODEL_NAME}.pte ${DOWNLOADED_PATH}/tokenizer.*
            ls -lh model.zip
-            mkdir -p "${ARTIFACTS_DIR_NAME}"
-            mv model.zip "${ARTIFACTS_DIR_NAME}"
+            mkdir -p ${ARTIFACTS_DIR_NAME}
+            mv model.zip ${ARTIFACTS_DIR_NAME}
+            ls -lh ${ARTIFACTS_DIR_NAME}
          elif [[ ${{ matrix.model }} == "llama" ]]; then
            # Install requirements for export_llama
            PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh