From 50f60d21f7c065617e597129d0bfff2a41f7c713 Mon Sep 17 00:00:00 2001 From: Zhang Yi3 Date: Tue, 18 Mar 2025 17:34:01 +0800 Subject: [PATCH] [CPU]Fix by_channel quant for avx2 --- .../src/nodes/kernels/scaled_attn/attn_quant.cpp | 4 ++-- .../src/nodes/kernels/scaled_attn/executor_pa.cpp | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.cpp index 36775ddc88d4bb..a4b90f1929eb34 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.cpp @@ -275,8 +275,8 @@ static void quant_u8_by_channel_kernel(const T* src, } } #endif - for (size_t i = 0; i < seq_dim; ++i) { - for (; j < hidden_dims; j++) { + for (; j < hidden_dims; j++) { + for (size_t i = 0; i < seq_dim; ++i) { float tmp = src[i * src_stride + j]; dst[i * dst_stride + j] = static_cast(std::round(tmp / scale[j] + zp[j])); } diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp index 6a85d5c2502b85..556d76b4503518 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp @@ -681,10 +681,10 @@ static void dot_product_block_by_channel(TA* a, uint8_t* b, float* c, const size auto va2 = mm256_uni_loadu_ps(a + i + vec_len_f32_avx2 * 2); auto va3 = mm256_uni_loadu_ps(a + i + vec_len_f32_avx2 * 3); - auto vb0_128 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(b + i)); - auto vb1_128 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(b + i + vec_len_f32_avx2)); - auto vb2_128 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(b + i + vec_len_f32_avx2 * 2)); - auto vb3_128 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(b + i + vec_len_f32_avx2 * 3)); + auto vb0_128 = 
_mm_loadl_epi64(reinterpret_cast<__m128i*>(b + params_offset + i)); + auto vb1_128 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(b + params_offset + i + vec_len_f32_avx2)); + auto vb2_128 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(b + params_offset + i + vec_len_f32_avx2 * 2)); + auto vb3_128 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(b + params_offset + i + vec_len_f32_avx2 * 3)); auto vb0_256 = _mm256_cvtepu8_epi32(vb0_128); auto vb1_256 = _mm256_cvtepu8_epi32(vb1_128);