[GPU] scale group correction for 3d weight requirement for onednn (#32871)

clee30 · web-flow · commit 3539f6ceb31c · 2025-11-19T14:24:39.000Z
### Details: Correct scale/zp reorder and shape for 3D weight requirement for oneDNN ### Tickets: CVS-176108
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_quantization.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_quantization.cpp
@@ -527,10 +527,10 @@ static void optimize_weights_decompression_parameters(fully_connected_node& fc_n
     if (!fc_prim->compressed_weights)
         return;
 
-    auto reorder_bfyx_to_fbyx = [&](size_t dep_id) {
+    auto reorder_bfyx = [&](size_t dep_id, cldnn::format format) {
         auto& dep = fc_node.get_dependency(dep_id);
         auto target_layout = dep.get_output_layout();
-        target_layout.format = format::fbyx;
+        target_layout.format = format;
         auto reorder_prim = std::make_shared<reorder>(dep.id() + "_reorder_" + fc_node.id(), dep.id(), target_layout);
         p.add_intermediate(reorder_prim, fc_node, dep_id, true);
         fc_node.get_dependency(dep_id).recalc_output_layout(false);
@@ -563,15 +563,19 @@ static void optimize_weights_decompression_parameters(fully_connected_node& fc_n
         weight_rank = std::max(static_cast<size_t>(2), weight_rank);
     }
 
+    cldnn::format format = format::fbyx;
+    if (weight_rank == 3)
+      format = format::byfx;
+
     auto decompression_scale_idx = !fc_node.bias_term() ? 2 : 3;
     if (need_reorder(decompression_scale_idx, weight_rank)) {
-        reorder_bfyx_to_fbyx(decompression_scale_idx);
+        reorder_bfyx(decompression_scale_idx, format);
     }
 
     if (fc_prim->decompression_zero_point.is_valid()) {
         auto decompression_zp_idx = decompression_scale_idx + 1;
         if (need_reorder(decompression_zp_idx, weight_rank)) {
-            reorder_bfyx_to_fbyx(decompression_zp_idx);
+            reorder_bfyx(decompression_zp_idx, format);
         }
     }
 }
diff --git a/src/plugins/intel_gpu/src/plugin/transformations/convert_fc_to_compressed.cpp b/src/plugins/intel_gpu/src/plugin/transformations/convert_fc_to_compressed.cpp
@@ -77,7 +77,7 @@ ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyCon
             } else {
                 OPENVINO_ASSERT(current_shape.size() == 4 && is_weight_3d);
                     new_shape = (has_transpose || !grouped) ? ov::Shape{current_shape[0], current_shape[1] * current_shape[2], current_shape[3]}
-                                                            : ov::Shape{current_shape[0], current_shape[1], current_shape[3] * current_shape[1]};
+                                                            : ov::Shape{current_shape[0], current_shape[1], current_shape[3] * current_shape[2]};
             }
             auto new_constant = std::make_shared<ov::op::v0::Constant>(*constant, new_shape);
 
diff --git a/src/plugins/intel_gpu/tests/unit/transformations/convert_fc_to_compressed_test.cpp b/src/plugins/intel_gpu/tests/unit/transformations/convert_fc_to_compressed_test.cpp
@@ -190,7 +190,7 @@ TEST_F(TransformationTestsF, ConvertFCToCompressed6) {
         auto reshape = std::make_shared<ov::op::v1::Reshape>(scale, reshape_const, false);
         auto transpose_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 2 }, { 1, 0 });
         auto transpose = std::make_shared<ov::op::v1::Transpose>(reshape, transpose_const);
-	auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
+        auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
         auto fc = std::make_shared<ov::intel_gpu::op::FullyConnected>(input1, transpose, no_bias);
 
         model = std::make_shared<ov::Model>(ov::OutputVector{fc}, ov::ParameterVector{input1});
@@ -201,7 +201,7 @@ TEST_F(TransformationTestsF, ConvertFCToCompressed6) {
         auto weights_const = ov::op::v0::Constant::create(ov::element::u4, ov::Shape{ 16, 32 }, { 1 });
         auto transpose_weights_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 2 }, { 1, 0 });
         auto transpose_weights = std::make_shared<ov::op::v1::Transpose>(weights_const, transpose_weights_const);
-	auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
+        auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
         auto scale_const = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{ 4, 32 }, { 1 });
         auto transpose_scale_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 2 }, { 1, 0 });
         auto transpose_scale = std::make_shared<ov::op::v1::Transpose>(scale_const, transpose_scale_const);
@@ -228,7 +228,7 @@ TEST_F(TransformationTestsF, ConvertFCToCompressed7) {
         auto reshape = std::make_shared<ov::op::v1::Reshape>(scale, reshape_const, false);
         auto transpose_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 2 }, { 1, 0 });
         auto transpose = std::make_shared<ov::op::v1::Transpose>(reshape, transpose_const);
-	auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
+        auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
         auto fc = std::make_shared<ov::intel_gpu::op::FullyConnected>(input1, transpose, no_bias);
 
         model = std::make_shared<ov::Model>(ov::OutputVector{fc}, ov::ParameterVector{input1});
@@ -239,7 +239,7 @@ TEST_F(TransformationTestsF, ConvertFCToCompressed7) {
         auto weights_const = ov::op::v0::Constant::create(ov::element::u4, ov::Shape{ 16, 32 }, { 1 });
         auto transpose_weights_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 2 }, { 1, 0 });
         auto transpose_weights = std::make_shared<ov::op::v1::Transpose>(weights_const, transpose_weights_const);
-	auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
+        auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
         auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 4, 32 }, { 1 });
         auto transpose_scale_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 2 }, { 1, 0 });
         auto transpose_scale = std::make_shared<ov::op::v1::Transpose>(scale_const, transpose_scale_const);
@@ -345,7 +345,7 @@ TEST_F(TransformationTestsF, ConvertFCToCompressed8) {
         auto reshape = std::make_shared<ov::op::v1::Reshape>(scale, reshape_const, false);
         auto transpose_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 2 }, { 1, 0 });
         auto transpose = std::make_shared<ov::op::v1::Transpose>(reshape, transpose_const);
-	auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
+        auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
 
         auto param1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{-1, 15});
         auto const_value1 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1, 1}, {1});
@@ -374,7 +374,7 @@ TEST_F(TransformationTestsF, ConvertFCToCompressed8) {
         auto weights_const = ov::op::v0::Constant::create(ov::element::u4, ov::Shape{ 16, 32 }, { 1 });
         auto transpose_weights_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 2 }, { 1, 0 });
         auto transpose_weights = std::make_shared<ov::op::v1::Transpose>(weights_const, transpose_weights_const);
-	auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
+        auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
         auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 4, 32 }, { 1 });
         auto transpose_scale_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 2 }, { 1, 0 });
         auto transpose_scale = std::make_shared<ov::op::v1::Transpose>(scale_const, transpose_scale_const);
@@ -417,7 +417,7 @@ TEST_F(TransformationTestsF, ConvertFCToCompressed9) {
         auto sub = std::make_shared<ov::op::v1::Subtract>(convert, zp_const);
         auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 });
         auto scale = std::make_shared<ov::op::v1::Multiply>(sub, scale_const);
-	    auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
+        auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
         auto fc = std::make_shared<ov::intel_gpu::op::FullyConnected>(input1, scale, no_bias);
 
         model = std::make_shared<ov::Model>(ov::OutputVector{fc}, ov::ParameterVector{input1});
@@ -428,7 +428,7 @@ TEST_F(TransformationTestsF, ConvertFCToCompressed9) {
         auto weights_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 16 }, { 1 });
         auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 });
         auto zp_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 });
-	    auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
+        auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
         auto fc_compressed = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(input1, weights_const, no_bias, scale_const, zp_const);
 
         model_ref = std::make_shared<ov::Model>(ov::OutputVector{fc_compressed}, ov::ParameterVector{input1});
@@ -445,7 +445,7 @@ TEST_F(TransformationTestsF, ConvertFCToCompressed10) {
         auto sub = std::make_shared<ov::op::v1::Subtract>(convert, zp_convert);
         auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 });
         auto scale = std::make_shared<ov::op::v1::Multiply>(sub, scale_const);
-	    auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
+        auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
         auto fc = std::make_shared<ov::intel_gpu::op::FullyConnected>(input1, scale, no_bias);
 
         model = std::make_shared<ov::Model>(ov::OutputVector{fc}, ov::ParameterVector{input1});
@@ -456,7 +456,7 @@ TEST_F(TransformationTestsF, ConvertFCToCompressed10) {
         auto weights_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 16 }, { 1 });
         auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 });
         auto zp_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 1 }, { 1 });
-	    auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
+        auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
         auto fc_compressed = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(input1, weights_const, no_bias, scale_const, zp_const);
 
         model_ref = std::make_shared<ov::Model>(ov::OutputVector{fc_compressed}, ov::ParameterVector{input1});
diff --git a/src/plugins/intel_gpu/tests/unit/transformations/convert_matmul_to_fc_test.cpp b/src/plugins/intel_gpu/tests/unit/transformations/convert_matmul_to_fc_test.cpp

Original file line number	Diff line number	Diff line change
`@@ -77,7 +77,7 @@ ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyCon`
`77`	`77`	`} else {`
`78`	`78`	`OPENVINO_ASSERT(current_shape.size() == 4 && is_weight_3d);`
`79`	`79`	`new_shape = (has_transpose \|\| !grouped) ? ov::Shape{current_shape[0], current_shape[1] * current_shape[2], current_shape[3]}`
`80`		`- : ov::Shape{current_shape[0], current_shape[1], current_shape[3] * current_shape[1]};`
	`80`	`+ : ov::Shape{current_shape[0], current_shape[1], current_shape[3] * current_shape[2]};`
`81`	`81`	`}`
`82`	`82`	`auto new_constant = std::make_shared<ov::op::v0::Constant>(*constant, new_shape);`
`83`	`83`