Skip to content

Commit 3539f6c

Browse files
authored
[GPU] scale group correction for 3d weight requirement for onednn (#32871)
### Details:
Correct the scale/zero-point (zp) reorder format and shape for the 3D weight requirement of oneDNN.

### Tickets:
CVS-176108
1 parent d8c7414 commit 3539f6c

File tree

4 files changed

+67
-31
lines changed

4 files changed

+67
-31
lines changed

src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_quantization.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -527,10 +527,10 @@ static void optimize_weights_decompression_parameters(fully_connected_node& fc_n
527527
if (!fc_prim->compressed_weights)
528528
return;
529529

530-
auto reorder_bfyx_to_fbyx = [&](size_t dep_id) {
530+
auto reorder_bfyx = [&](size_t dep_id, cldnn::format format) {
531531
auto& dep = fc_node.get_dependency(dep_id);
532532
auto target_layout = dep.get_output_layout();
533-
target_layout.format = format::fbyx;
533+
target_layout.format = format;
534534
auto reorder_prim = std::make_shared<reorder>(dep.id() + "_reorder_" + fc_node.id(), dep.id(), target_layout);
535535
p.add_intermediate(reorder_prim, fc_node, dep_id, true);
536536
fc_node.get_dependency(dep_id).recalc_output_layout(false);
@@ -563,15 +563,19 @@ static void optimize_weights_decompression_parameters(fully_connected_node& fc_n
563563
weight_rank = std::max(static_cast<size_t>(2), weight_rank);
564564
}
565565

566+
cldnn::format format = format::fbyx;
567+
if (weight_rank == 3)
568+
format = format::byfx;
569+
566570
auto decompression_scale_idx = !fc_node.bias_term() ? 2 : 3;
567571
if (need_reorder(decompression_scale_idx, weight_rank)) {
568-
reorder_bfyx_to_fbyx(decompression_scale_idx);
572+
reorder_bfyx(decompression_scale_idx, format);
569573
}
570574

571575
if (fc_prim->decompression_zero_point.is_valid()) {
572576
auto decompression_zp_idx = decompression_scale_idx + 1;
573577
if (need_reorder(decompression_zp_idx, weight_rank)) {
574-
reorder_bfyx_to_fbyx(decompression_zp_idx);
578+
reorder_bfyx(decompression_zp_idx, format);
575579
}
576580
}
577581
}

src/plugins/intel_gpu/src/plugin/transformations/convert_fc_to_compressed.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyCon
7777
} else {
7878
OPENVINO_ASSERT(current_shape.size() == 4 && is_weight_3d);
7979
new_shape = (has_transpose || !grouped) ? ov::Shape{current_shape[0], current_shape[1] * current_shape[2], current_shape[3]}
80-
: ov::Shape{current_shape[0], current_shape[1], current_shape[3] * current_shape[1]};
80+
: ov::Shape{current_shape[0], current_shape[1], current_shape[3] * current_shape[2]};
8181
}
8282
auto new_constant = std::make_shared<ov::op::v0::Constant>(*constant, new_shape);
8383

src/plugins/intel_gpu/tests/unit/transformations/convert_fc_to_compressed_test.cpp

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ TEST_F(TransformationTestsF, ConvertFCToCompressed6) {
190190
auto reshape = std::make_shared<ov::op::v1::Reshape>(scale, reshape_const, false);
191191
auto transpose_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 2 }, { 1, 0 });
192192
auto transpose = std::make_shared<ov::op::v1::Transpose>(reshape, transpose_const);
193-
auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
193+
auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
194194
auto fc = std::make_shared<ov::intel_gpu::op::FullyConnected>(input1, transpose, no_bias);
195195

196196
model = std::make_shared<ov::Model>(ov::OutputVector{fc}, ov::ParameterVector{input1});
@@ -201,7 +201,7 @@ TEST_F(TransformationTestsF, ConvertFCToCompressed6) {
201201
auto weights_const = ov::op::v0::Constant::create(ov::element::u4, ov::Shape{ 16, 32 }, { 1 });
202202
auto transpose_weights_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 2 }, { 1, 0 });
203203
auto transpose_weights = std::make_shared<ov::op::v1::Transpose>(weights_const, transpose_weights_const);
204-
auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
204+
auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
205205
auto scale_const = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{ 4, 32 }, { 1 });
206206
auto transpose_scale_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 2 }, { 1, 0 });
207207
auto transpose_scale = std::make_shared<ov::op::v1::Transpose>(scale_const, transpose_scale_const);
@@ -228,7 +228,7 @@ TEST_F(TransformationTestsF, ConvertFCToCompressed7) {
228228
auto reshape = std::make_shared<ov::op::v1::Reshape>(scale, reshape_const, false);
229229
auto transpose_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 2 }, { 1, 0 });
230230
auto transpose = std::make_shared<ov::op::v1::Transpose>(reshape, transpose_const);
231-
auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
231+
auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
232232
auto fc = std::make_shared<ov::intel_gpu::op::FullyConnected>(input1, transpose, no_bias);
233233

234234
model = std::make_shared<ov::Model>(ov::OutputVector{fc}, ov::ParameterVector{input1});
@@ -239,7 +239,7 @@ TEST_F(TransformationTestsF, ConvertFCToCompressed7) {
239239
auto weights_const = ov::op::v0::Constant::create(ov::element::u4, ov::Shape{ 16, 32 }, { 1 });
240240
auto transpose_weights_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 2 }, { 1, 0 });
241241
auto transpose_weights = std::make_shared<ov::op::v1::Transpose>(weights_const, transpose_weights_const);
242-
auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
242+
auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
243243
auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 4, 32 }, { 1 });
244244
auto transpose_scale_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 2 }, { 1, 0 });
245245
auto transpose_scale = std::make_shared<ov::op::v1::Transpose>(scale_const, transpose_scale_const);
@@ -345,7 +345,7 @@ TEST_F(TransformationTestsF, ConvertFCToCompressed8) {
345345
auto reshape = std::make_shared<ov::op::v1::Reshape>(scale, reshape_const, false);
346346
auto transpose_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 2 }, { 1, 0 });
347347
auto transpose = std::make_shared<ov::op::v1::Transpose>(reshape, transpose_const);
348-
auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
348+
auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
349349

350350
auto param1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{-1, 15});
351351
auto const_value1 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1, 1}, {1});
@@ -374,7 +374,7 @@ TEST_F(TransformationTestsF, ConvertFCToCompressed8) {
374374
auto weights_const = ov::op::v0::Constant::create(ov::element::u4, ov::Shape{ 16, 32 }, { 1 });
375375
auto transpose_weights_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 2 }, { 1, 0 });
376376
auto transpose_weights = std::make_shared<ov::op::v1::Transpose>(weights_const, transpose_weights_const);
377-
auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
377+
auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
378378
auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 4, 32 }, { 1 });
379379
auto transpose_scale_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 2 }, { 1, 0 });
380380
auto transpose_scale = std::make_shared<ov::op::v1::Transpose>(scale_const, transpose_scale_const);
@@ -417,7 +417,7 @@ TEST_F(TransformationTestsF, ConvertFCToCompressed9) {
417417
auto sub = std::make_shared<ov::op::v1::Subtract>(convert, zp_const);
418418
auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 });
419419
auto scale = std::make_shared<ov::op::v1::Multiply>(sub, scale_const);
420-
auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
420+
auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
421421
auto fc = std::make_shared<ov::intel_gpu::op::FullyConnected>(input1, scale, no_bias);
422422

423423
model = std::make_shared<ov::Model>(ov::OutputVector{fc}, ov::ParameterVector{input1});
@@ -428,7 +428,7 @@ TEST_F(TransformationTestsF, ConvertFCToCompressed9) {
428428
auto weights_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 16 }, { 1 });
429429
auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 });
430430
auto zp_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 });
431-
auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
431+
auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
432432
auto fc_compressed = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(input1, weights_const, no_bias, scale_const, zp_const);
433433

434434
model_ref = std::make_shared<ov::Model>(ov::OutputVector{fc_compressed}, ov::ParameterVector{input1});
@@ -445,7 +445,7 @@ TEST_F(TransformationTestsF, ConvertFCToCompressed10) {
445445
auto sub = std::make_shared<ov::op::v1::Subtract>(convert, zp_convert);
446446
auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 });
447447
auto scale = std::make_shared<ov::op::v1::Multiply>(sub, scale_const);
448-
auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
448+
auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
449449
auto fc = std::make_shared<ov::intel_gpu::op::FullyConnected>(input1, scale, no_bias);
450450

451451
model = std::make_shared<ov::Model>(ov::OutputVector{fc}, ov::ParameterVector{input1});
@@ -456,7 +456,7 @@ TEST_F(TransformationTestsF, ConvertFCToCompressed10) {
456456
auto weights_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 16 }, { 1 });
457457
auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 });
458458
auto zp_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 1 }, { 1 });
459-
auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
459+
auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
460460
auto fc_compressed = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(input1, weights_const, no_bias, scale_const, zp_const);
461461

462462
model_ref = std::make_shared<ov::Model>(ov::OutputVector{fc_compressed}, ov::ParameterVector{input1});

0 commit comments

Comments
 (0)