diff --git a/.ci/scripts/test_lora.sh b/.ci/scripts/test_lora.sh index 195a8ec68ed..e24eaf1f75d 100644 --- a/.ci/scripts/test_lora.sh +++ b/.ci/scripts/test_lora.sh @@ -151,13 +151,12 @@ else fi ### QUANTIZATION & PROGRAM DATA SEPARATION ### -EXPECTED_QUANT_PREFIX="<|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant: - +EXPECTED_QUANT_PREFIX="<|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant: me Okay, so I need to calculate 15% of 80." EXPECTED_QUANT_LORA_PREFIX=" <|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant -To calculate 15% of 80, we can multiply 80 by 15/100 and then simplify the fraction. -So, 15% of 80 is equal to (80 * 15) / 100 = 1200 / 100 = 12. +To calculate 15% of 80, we can multiply 80 by 15/100. +So, 15% of 80 is equal to 80 * 15/100 = 12. #### 12 The answer is: 12<|im_end|>" EXPECTED_QUANT_LORA_ALTERNATE_PREFIX=" @@ -169,6 +168,7 @@ So, 15% of 80 is 12. The answer is: 12<|im_end|>" + # Export Quantized PTE, PTD file, no LoRA. # override base.lora_config=null to avoid creating a lora model # and loading lora weights. @@ -228,7 +228,7 @@ fi NOW=$(date +"%H:%M:%S") echo "Test 4: Quantized, program-data separation lora. Starting to run llama runner at ${NOW}" # shellcheck source=/dev/null -cmake-out/examples/models/llama/llama_main --model_path=qwen_lora_math_q.pte --data_paths="qwen_foundation_q.ptd,qwen_lora_math_q.ptd" --prompt="${PROMPT}" ${RUNTIME_ARGS} --seq_len=104 > result.txt +cmake-out/examples/models/llama/llama_main --model_path=qwen_lora_math_q.pte --data_paths="qwen_foundation_q.ptd,qwen_lora_math_q.ptd" --prompt="${PROMPT}" ${RUNTIME_ARGS} > result.txt NOW=$(date +"%H:%M:%S") echo "Finished at ${NOW}" diff --git a/.ci/scripts/test_lora_multimethod.sh b/.ci/scripts/test_lora_multimethod.sh index 7c468eb226b..bd192f85a51 100755 --- a/.ci/scripts/test_lora_multimethod.sh +++ b/.ci/scripts/test_lora_multimethod.sh @@ -85,8 +85,7 @@ EXPECTED_LORA_PREFIX=" <|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant To calculate 15% of 80" -EXPECTED_BASE_PREFIX="<|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant: - +EXPECTED_BASE_PREFIX="<|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant: me Okay, so I need to calculate 15% of 80." ### TEST 1: Run lora_forward method ### diff --git a/extension/llm/custom_ops/op_sdpa.cpp b/extension/llm/custom_ops/op_sdpa.cpp index e3b12895926..2d3777b3324 100644 --- a/extension/llm/custom_ops/op_sdpa.cpp +++ b/extension/llm/custom_ops/op_sdpa.cpp @@ -412,7 +412,12 @@ Tensor& custom_sdpa_out_impl( InvalidArgument, output); - bool use_unfused_sdpa = seq_len == 1; + // Quantized GEMM kernels may not handle non-contiguous per-head strides + // correctly when seq_dim=ONE and seq_len > 1, so keep the conservative + // condition for quantized inputs. + bool is_quantized = q.scalar_type() == ScalarType::Char; + bool use_unfused_sdpa = (!is_quantized) && + (seq_len <= 128 || num_keys_for_causal_attention <= 128); if (use_unfused_sdpa) { ET_SWITCH_FLOAT_TYPES(output.scalar_type(), ctx, "sdpa", CTYPE, [&] { sdpa::impl::cpu_sdpa(