[ET-VK][quantized] Select dq8ca zero-point binding by its allocated dtype

ssjia · ssjia · commit 8f409a5770c2 · 2026-06-25T21:39:43.000-07:00
Pull Request resolved: #20491 The per-token dynamic-activation-quant (`dq8ca`) zero-point image must be bound in the shader with the same dtype the tensor was allocated with; a binding-vs-allocation dtype mismatch corrupts the per-token zero-point. The allocation dtype differs by export path: standard `export_llama -qmode 8da4w` models (e.g. Qwen3-0.6B) serialize the zero-point as `int8`, while the Llama4-mini TISO backbone (torchao `per_token_dynamic_quant` / `Int8DynamicActivationIntxWeightConfig` with an explicit fp32 `zero_point_dtype`) serializes it as float, which `vulkan_graph_builder.get_effective_dtype` downcasts to `half` under `force_fp16`. A single fixed binding dtype cannot satisfy both paths. Binding the zero-point as `int8` (`rgba8i`) corrupts the float-allocated TISO zero-point on ARM Mali (Valhall) -- negative values come back as garbage, garbling the 8da4w TTS backbone. Conversely, binding it as the codegen `DTYPE` (matching the scale's float dtype) corrupts the int8-allocated zero-point: under fp16 inference the `rgba8i` image is read and written as `rgba16f`, saturating the per-token zero-point to the int8 floor/ceiling and garbling standard fp16 8da4w models such as Qwen3-0.6B. This change makes the zero-point binding a codegen variant so it always matches the tensor's allocation. A new `ZP_DTYPE_MODE` axis emits two variants of every dq8ca shader that binds the per-token zero-point: `zpint8` (binding declared `int8`, an `rgba8i` integer image) and `zpinherit` (binding declared with the codegen `DTYPE`, inheriting the inference float dtype to match the scale -- `rgba32f`, or `rgba16f` under `USE_VULKAN_FP16_INFERENCE`). The C++ shader pickers select the variant from `graph.dtype_of(zero_point)` (`kChar` -> `zpint8`; `kHalf` / `kFloat` -> `zpinherit`), so the shader binding matches the tensor's allocation regardless of how the model was exported. 
The read helper `load_int8_input_scales_and_zps` is shared by both variants: it wraps the fetch as `ivec4(texelFetch(t_int8_input_zps, ...))`, which reads both an integer image (identity) and a float image (exact truncation of the integer-valued zero-point in `[-128, 127]`). Affected shaders: `choose_qparams_per_row` (writes the zero-point, storing `ivec4` or `VEC4_T` per variant), `quantize_and_pack_4h4w_with_group_sums`, `linear_dq8ca_q4gsw_tiled`, and the dq8ca `linear_q4gsw_coop` variants (read the zero-point). This fixes the fp16 8da4w regression for standard int8 zero-point exports while preserving the float zero-point path that the TISO backbone and the original Mali fix depend on. Only the runtime shader binding changes, so existing `.pte` files are handled correctly with no re-export. Authored with Claude Code. ghstack-source-id: 397279874 @exported-using-ghexport Differential Revision: [D109595977](https://our.internmc.facebook.com/intern/diff/D109595977/)
diff --git a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_per_row.glsl b/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_per_row.glsl
@@ -30,7 +30,7 @@ layout(std430) buffer;
 #include "common.glslh"
 
 ${layout_declare_tensor(B, "w", "t_scales", DTYPE, "texture3d")}
-${layout_declare_tensor(B, "w", "t_zps", "int8", "texture3d")}
+${layout_declare_tensor(B, "w", "t_zps", "int8" if ZP_DTYPE_MODE == "zpint8" else DTYPE, "texture3d")}
 ${layout_declare_tensor(B, "r", "t_input", DTYPE, STORAGE, is_scalar_array=False)}
 
 ${layout_declare_ubo(B, "ivec4", "input_sizes")}
@@ -196,7 +196,10 @@ void main() {
 
   if (worker_id == 0) {
     imageStore(t_scales, ivec3(output_y4, 0, 0), scales_out);
-    imageStore(t_zps, ivec3(output_y4, 0, 0), zps_out);
+    $if ZP_DTYPE_MODE == "zpint8":
+      imageStore(t_zps, ivec3(output_y4, 0, 0), zps_out);
+    $else:
+      imageStore(t_zps, ivec3(output_y4, 0, 0), VEC4_T(zps_out));
   }
 
 }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_per_row.yaml b/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_per_row.yaml
@@ -8,12 +8,16 @@ choose_qparams_per_row:
   parameter_names_with_default_values:
     DTYPE: float
     STORAGE: texture3d
+    ZP_DTYPE_MODE: zpint8
   generate_variant_forall:
     STORAGE:
       - VALUE: texture3d
       - VALUE: buffer
     DTYPE:
       - VALUE: float
       - VALUE: half
+    ZP_DTYPE_MODE:
+      - VALUE: zpint8
+      - VALUE: zpinherit
   shader_variants:
     - NAME: choose_qparams_per_row
diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_dq8ca_q4gsw_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_dq8ca_q4gsw_tiled.glsl
@@ -46,7 +46,7 @@ ${layout_declare_tensor(B, "r", "t_input", DTYPE, IO_STORAGE, is_scalar_array=Fa
 ${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", PACKED_INT8_INPUT_STORAGE, is_scalar_array=False)}
 ${layout_declare_tensor(B, "r", "t_int8_input_sums", "int", "buffer", is_scalar_array=False)}
 ${layout_declare_tensor(B, "r", "t_int8_input_scales", DTYPE, "texture3d")}
-${layout_declare_tensor(B, "r", "t_int8_input_zps", "int8", "texture3d")}
+${layout_declare_tensor(B, "r", "t_int8_input_zps", "int8" if ZP_DTYPE_MODE == "zpint8" else DTYPE, "texture3d")}
 ${layout_declare_tensor(B, "r", "t_packed_int4_weight", "int", WEIGHT_STORAGE, is_scalar_array=False)}
 ${layout_declare_tensor(B, "r", "t_weight_sums", "int", "buffer", is_scalar_array=False)}
 ${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_dq8ca_q4gsw_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/linear_dq8ca_q4gsw_tiled.yaml
@@ -13,10 +13,14 @@ linear_dq8ca_q4gsw_tiled:
     TILE_M4: 1
     TILE_K4: 1
     TILE_N8: 1
+    ZP_DTYPE_MODE: zpint8
   generate_variant_forall:
     DTYPE:
       - VALUE: float
       - VALUE: half
+    ZP_DTYPE_MODE:
+      - VALUE: zpint8
+      - VALUE: zpinherit
   shader_variants:
     - NAME: linear_dq8ca_q4gsw_tiled_texture3d_texture2d
     - NAME: linear_dq8ca_q4gsw_tiled_texture3d_buffer
diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_scales_zps_load.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_scales_zps_load.glslh
@@ -20,7 +20,8 @@ void load_int8_input_scales_and_zps(
   [[unroll]] for (int m4 = 0; m4 < TILE_M4; m4++) {
     scales.data[m4] =
         VEC4_T(texelFetch(t_int8_input_scales, ivec3(m4_start + m4, 0, 0), 0));
-    zps.data[m4] = texelFetch(t_int8_input_zps, ivec3(m4_start + m4, 0, 0), 0);
+    zps.data[m4] =
+        ivec4(texelFetch(t_int8_input_zps, ivec3(m4_start + m4, 0, 0), 0));
   }
 }
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_coop.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_coop.glsl
@@ -40,7 +40,7 @@ $if DYNAMIC_QUANT_VARIANT:
   ${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", PACKED_INPUT_STORAGE, is_scalar_array=False)}
   ${layout_declare_tensor(B, "r", "t_int_input_sums", "int", "buffer", is_scalar_array=False)}
   ${layout_declare_tensor(B, "r", "t_input_scale", DTYPE, "texture3d")}
-  ${layout_declare_tensor(B, "r", "t_input_zp", "int", "texture3d")}
+  ${layout_declare_tensor(B, "r", "t_input_zp", "int8" if ZP_DTYPE_MODE == "zpint8" else DTYPE, "texture3d")}
   ${layout_declare_tensor(B, "r", "t_packed_int4_weight", "int", WEIGHT_STORAGE, is_scalar_array=False)}
   ${layout_declare_tensor(B, "r", "t_weight_sums", "int", "buffer", is_scalar_array=False)}
   ${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_coop.yaml b/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_coop.yaml
@@ -15,6 +15,7 @@ linear_q4gsw_coop:
     TILE_N8: 1
     WGS: 64
     DYNAMIC_QUANT_VARIANT: false
+    ZP_DTYPE_MODE: zpint8
   generate_variant_forall:
     DTYPE:
       - VALUE: float
@@ -30,14 +31,42 @@ linear_q4gsw_coop:
       WEIGHT_STORAGE: buffer
     - NAME: linear_dq8ca_q4gsw_coop_texture3d_texture2d
       DYNAMIC_QUANT_VARIANT: true
+      generate_variant_forall:
+        DTYPE:
+          - VALUE: float
+          - VALUE: half
+        ZP_DTYPE_MODE:
+          - VALUE: zpint8
+          - VALUE: zpinherit
     - NAME: linear_dq8ca_q4gsw_coop_texture3d_buffer
       WEIGHT_STORAGE: buffer
       DYNAMIC_QUANT_VARIANT: true
+      generate_variant_forall:
+        DTYPE:
+          - VALUE: float
+          - VALUE: half
+        ZP_DTYPE_MODE:
+          - VALUE: zpint8
+          - VALUE: zpinherit
     - NAME: linear_dq8ca_q4gsw_coop_buffer_texture2d
       IO_STORAGE: buffer
       WEIGHT_STORAGE: texture2d
       DYNAMIC_QUANT_VARIANT: true
+      generate_variant_forall:
+        DTYPE:
+          - VALUE: float
+          - VALUE: half
+        ZP_DTYPE_MODE:
+          - VALUE: zpint8
+          - VALUE: zpinherit
     - NAME: linear_dq8ca_q4gsw_coop_buffer_buffer
       IO_STORAGE: buffer
       WEIGHT_STORAGE: buffer
       DYNAMIC_QUANT_VARIANT: true
+      generate_variant_forall:
+        DTYPE:
+          - VALUE: float
+          - VALUE: half
+        ZP_DTYPE_MODE:
+          - VALUE: zpint8
+          - VALUE: zpinherit
diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_4h4w_with_group_sums.glsl b/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_4h4w_with_group_sums.glsl
@@ -33,7 +33,7 @@ ${layout_declare_tensor(B, "w", "t_packed_int8_input", "int", OUTPUT_STORAGE, is
 ${layout_declare_tensor(B, "w", "t_int8_input_sums", "int", "buffer", is_scalar_array=False)}
 ${layout_declare_tensor(B, "r", "t_input", DTYPE, INPUT_STORAGE, is_scalar_array=False)}
 ${layout_declare_tensor(B, "r", "t_int8_input_scales", DTYPE, "texture3d")}
-${layout_declare_tensor(B, "r", "t_int8_input_zps", "int8", "texture3d")}
+${layout_declare_tensor(B, "r", "t_int8_input_zps", "int8" if ZP_DTYPE_MODE == "zpint8" else DTYPE, "texture3d")}
 
 ${layout_declare_ubo(B, "ivec4", "input_sizes")}
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_4h4w_with_group_sums.yaml b/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_4h4w_with_group_sums.yaml
@@ -11,10 +11,14 @@ quantize_and_pack_4h4w_with_group_sums:
     INPUT_STORAGE: texture3d
     NUM_GROUPS_PER_WG: 2
     NUM_WORKERS_PER_GROUP: 32
+    ZP_DTYPE_MODE: zpint8
   generate_variant_forall:
     DTYPE:
       - VALUE: half
       - VALUE: float
+    ZP_DTYPE_MODE:
+      - VALUE: zpint8
+      - VALUE: zpinherit
   shader_variants:
     - NAME: quantize_and_pack_4h4w_with_group_sums_o2w32_buffer_texture3d
     - NAME: quantize_and_pack_4h4w_with_group_sums_o2w32_buffer_buffer
diff --git a/backends/vulkan/runtime/graph/ops/impl/ChooseQParams.cpp b/backends/vulkan/runtime/graph/ops/impl/ChooseQParams.cpp
@@ -41,10 +41,12 @@ vkapi::ShaderInfo pick_choose_qparams_per_row_shader(
   (void)resize_args;
 
   const ValueRef input = args.at(1).refs.at(0);
+  const ValueRef input_zps = args.at(0).refs.at(1);
 
   std::string kernel_name = "choose_qparams_per_row";
   add_storage_type_suffix(kernel_name, graph->storage_type_of(input));
   add_dtype_suffix(kernel_name, graph->dtype_of(input));
+  add_zp_dtype_mode_suffix(kernel_name, graph->dtype_of(input_zps));
 
   return VK_KERNEL_FROM_STR(kernel_name);
 }
diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizeDequantize.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizeDequantize.cpp
@@ -66,6 +66,7 @@ vkapi::ShaderInfo pick_quantize_and_pack_4h4w_with_group_sums_shader(
     const std::vector<ValueRef>& resize_args) {
   const ValueRef packed_int_input = args.at(0).refs.at(0);
   const ValueRef fp_input = args.at(1).refs.at(0);
+  const ValueRef packed_input_zps = args.at(1).refs.at(2);
   const ValueRef group_size = resize_args.at(0);
 
   const int64_t group_size_val = graph->extract_scalar<int64_t>(group_size);
@@ -81,6 +82,7 @@ vkapi::ShaderInfo pick_quantize_and_pack_4h4w_with_group_sums_shader(
       shader_name, graph->storage_type_of(packed_int_input));
   add_storage_type_suffix(shader_name, graph->storage_type_of(fp_input));
   add_dtype_suffix(shader_name, graph->dtype_of(fp_input));
+  add_zp_dtype_mode_suffix(shader_name, graph->dtype_of(packed_input_zps));
 
   return VK_KERNEL_FROM_STR(shader_name);
 }
diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp
@@ -145,6 +145,7 @@ vkapi::ShaderInfo pick_linear_dqa_qw_shader(
   const ValueRef fp_input = args.at(1).refs.at(0);
   const ValueRef int_input = args.at(1).refs.at(1);
   (void)int_input;
+  const ValueRef input_zp = args.at(1).refs.at(4);
   const ValueRef int_weight = args.at(1).refs.at(5);
 
   const bool weight_is_4bit = resize_args.at(0) != kDummyValueRef;
@@ -165,6 +166,7 @@ vkapi::ShaderInfo pick_linear_dqa_qw_shader(
   add_storage_type_suffix(kernel_name, graph->storage_type_of(out));
   add_storage_type_suffix(kernel_name, graph->storage_type_of(int_weight));
   add_dtype_suffix(kernel_name, graph->dtype_of(out));
+  add_zp_dtype_mode_suffix(kernel_name, graph->dtype_of(input_zp));
 
   return VK_KERNEL_FROM_STR(kernel_name);
 }
diff --git a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp
@@ -72,6 +72,22 @@ void add_dtype_suffix(std::string& kernel_name, const vkapi::ScalarType dtype) {
   }
 }
 
+void add_zp_dtype_mode_suffix(
+    std::string& kernel_name,
+    const vkapi::ScalarType zp_dtype) {
+  switch (zp_dtype) {
+    case vkapi::kChar:
+      kernel_name += "_zpint8";
+      break;
+    case vkapi::kHalf:
+    case vkapi::kFloat:
+      kernel_name += "_zpinherit";
+      break;
+    default:
+      VK_THROW("Unsupported per-token zero-point dtype for dq8ca");
+  }
+}
+
 void add_packed_dim_suffix(std::string& kernel_name, const int32_t packed_dim) {
   switch (packed_dim) {
     case WHCN::kWidthDim:
diff --git a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h
@@ -22,6 +22,15 @@ void add_storage_type_suffix(
 
 void add_dtype_suffix(std::string& kernel_name, const vkapi::ScalarType dtype);
 
+// Selects the per-token zero-point shader binding variant by the dtype the
+// zero-point tensor was allocated with: "_zpint8" when the tensor is int8
+// (rgba8i integer image), "_zpinherit" when it follows the inference float
+// dtype (rgba32f/rgba16f, matching the scale). Matches the ZP_DTYPE_MODE
+// codegen axis used by the dq8ca qparams shaders.
+void add_zp_dtype_mode_suffix(
+    std::string& kernel_name,
+    const vkapi::ScalarType zp_dtype);
+
 void add_ndim_suffix(std::string& kernel_name, const size_t ndim);
 
 void add_packed_dim_suffix(std::string& kernel_name, const int32_t packed_dim);

Original file line number	Diff line number	Diff line change
`@@ -20,7 +20,8 @@ void load_int8_input_scales_and_zps(`
`20`	`20`	`[[unroll]] for (int m4 = 0; m4 < TILE_M4; m4++) {`
`21`	`21`	`scales.data[m4] =`
`22`	`22`	`VEC4_T(texelFetch(t_int8_input_scales, ivec3(m4_start + m4, 0, 0), 0));`
`23`		`- zps.data[m4] = texelFetch(t_int8_input_zps, ivec3(m4_start + m4, 0, 0), 0);`
	`23`	`+ zps.data[m4] =`
	`24`	`+ ivec4(texelFetch(t_int8_input_zps, ivec3(m4_start + m4, 0, 0), 0));`
`24`	`25`	`}`
`25`	`26`	`}`
`26`	`27`