
Commit f1b8abe

[GPU] Optimization for gemm & fc in iGPU. (openvinotoolkit#19780)
* Optimization for gemm & fc in iGPU.
  FC: fake alignment to 16 performs better on iGPU.
  Gemm: permute + gemm_tiled_opt is better than transposed_input + gemm_ref kernel for shapes not aligned to 16.
  Note that this is a temporary optimization and will be removed once the final solution (i.e., support for unaligned transposed input shapes in the gemm_tiled_opt kernel) is available.
* Fix unittest
* Fix for model_cache
* Fix unittest
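In sketch form, the FC side of the change reduces to picking a device-dependent alignment base. Below is a minimal standalone restatement; align_to and device_type here are simplified stand-ins for the plugin's own helpers, not the plugin API itself:

#include <cstddef>

// Simplified stand-in for the plugin's align_to helper:
// rounds x up to the nearest multiple of `align`.
static size_t align_to(size_t x, size_t align) {
    return (x + align - 1) / align * align;
}

enum class device_type { integrated_gpu, discrete_gpu };  // stand-in for cldnn::device_type

// iGPU pads the batch to a multiple of 16; dGPU keeps the previous base of 8.
size_t fake_aligned_batch(size_t batch, device_type dev) {
    const size_t fake_align_base = (dev == device_type::integrated_gpu) ? 16 : 8;
    return align_to(batch, fake_align_base);
}
// e.g. fake_aligned_batch(133, device_type::integrated_gpu) == 144,
//      fake_aligned_batch(133, device_type::discrete_gpu)  == 136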
1 parent efe5436 commit f1b8abe

File tree: 8 files changed (+79, -32 lines)

src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp

+6 -2

@@ -34,6 +34,7 @@ struct kernel_impl_params {
 
     bool has_runtime_layouts = false;
     const program *prog;
+    cldnn::device_type dev_type;
     stream::ptr strm;
     std::shared_ptr<const primitive> desc;
     size_t unique_id;
@@ -63,9 +64,11 @@ struct kernel_impl_params {
     std::vector<size_t> output_size;
     std::vector<size_t> img_size;
 
-    kernel_impl_params() : prog(nullptr), strm(nullptr), desc(nullptr), unique_id(0) {}
+    kernel_impl_params() : prog(nullptr), dev_type(cldnn::device_type::integrated_gpu), strm(nullptr), desc(nullptr), unique_id(0) {
+    }
 
     kernel_impl_params(program& _prog,
+                       cldnn::device_type _dev_type,
                        stream::ptr _strm,
                        std::shared_ptr<const primitive> _desc,
                        size_t _uid,
@@ -74,6 +77,7 @@ struct kernel_impl_params {
                        const std::vector<cldnn::fused_primitive_desc>& _fused_descs)
     : has_runtime_layouts(true)
     , prog(&_prog)
+    , dev_type(_dev_type)
     , strm(std::move(_strm))
     , desc(std::move(_desc))
     , unique_id(_uid)
@@ -135,7 +139,7 @@ struct kernel_impl_params {
         return std::static_pointer_cast<const PType>(desc)->type == PType::type_id();
     }
 
-    virtual primitive_type_id type() const { return desc->type; }
+    virtual primitive_type_id type() const { return desc->type; }
 
     void save(BinaryOutputBuffer& ob) const;
     void load(BinaryInputBuffer& ib);

src/plugins/intel_gpu/src/graph/fully_connected.cpp

+3 -2

@@ -187,8 +187,9 @@ kernel_impl_params fully_connected_inst::get_fake_aligned_params(kernel_impl_par
         return std::move(orig_impl_param);
     }
 
-    input_shape[input_row_idx] = align_to(input_shape[input_row_idx], 8);
-    output_shape[output_row_idx] = align_to(output_shape[output_row_idx], 8);
+    size_t fake_align_base = (orig_impl_param.dev_type == cldnn::device_type::integrated_gpu) ? 16 : 8;
+    input_shape[input_row_idx] = align_to(input_shape[input_row_idx], fake_align_base);
+    output_shape[output_row_idx] = align_to(output_shape[output_row_idx], fake_align_base);
 
     updated_param.input_layouts[0] = layout(ov::PartialShape(input_shape),
                                             orig_input_layout.data_type,

src/plugins/intel_gpu/src/graph/include/program_node.h

+2 -1

@@ -127,7 +127,8 @@ struct program_node {
     }
 
     virtual std::unique_ptr<kernel_impl_params> get_kernel_impl_params(const std::vector<layout>& in_layouts, const std::vector<layout>& out_layouts) const {
-        auto params = std::unique_ptr<kernel_impl_params>(new kernel_impl_params(get_program(), get_program().get_stream_ptr(), get_primitive(),
+        auto params = std::unique_ptr<kernel_impl_params>(new kernel_impl_params(get_program(), get_program().get_engine().get_device_info().dev_type,
+                                                                                 get_program().get_stream_ptr(), get_primitive(),
                                                                                  get_unique_id(), in_layouts, out_layouts, get_fused_primitives()));
        params->memory_deps = get_const_memory_deps();
        params->_can_be_optimized = this->optimized;

src/plugins/intel_gpu/src/graph/kernel_impl_params.cpp

+5

@@ -8,6 +8,7 @@
 #include "intel_gpu/graph/serialization/layout_serializer.hpp"
 #include "intel_gpu/graph/serialization/string_serializer.hpp"
 #include "intel_gpu/graph/serialization/vector_serializer.hpp"
+#include "intel_gpu/runtime/device_info.hpp"
 
 #include <string>
 #include <vector>
@@ -71,6 +72,7 @@ bool kernel_impl_params::operator==(const kernel_impl_params& rhs) const {
 
 void kernel_impl_params::save(BinaryOutputBuffer& ob) const {
     ob << desc;
+    ob << static_cast<uint64_t>(dev_type);
     ob << has_runtime_layouts;
     ob << unique_id;
     ob << input_layouts;
@@ -135,6 +137,9 @@ void kernel_impl_params::save(BinaryOutputBuffer& ob) const {
 void kernel_impl_params::load(BinaryInputBuffer& ib) {
     prog = nullptr;
     ib >> desc;
+    size_t dev_type_id = 0;
+    ib >> dev_type_id;
+    dev_type = static_cast<cldnn::device_type>(dev_type_id);
     ib >> has_runtime_layouts;
     ib >> unique_id;
     ib >> input_layouts;
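dev_type is now part of the serialized impl params (the "Fix for model_cache" bullet), using the usual enum round-trip: widen to a fixed-width integer on save, read it back and cast on load. Below is a self-contained sketch of the same pattern, with std::stringstream standing in for BinaryOutputBuffer/BinaryInputBuffer (note the committed load() reads into a size_t while save() writes a uint64_t; the sketch uses uint64_t on both sides):

#include <cstdint>
#include <sstream>

enum class device_type : int32_t { integrated_gpu = 0, discrete_gpu = 1 };

int main() {
    std::stringstream buf;  // stand-in for the binary cache buffers

    // save: widen the enum to a fixed-width integer so the cached
    // representation does not depend on the enum's underlying type
    device_type dev_type = device_type::integrated_gpu;
    uint64_t encoded = static_cast<uint64_t>(dev_type);
    buf.write(reinterpret_cast<const char*>(&encoded), sizeof(encoded));

    // load: read the integer back, then cast to the enum
    uint64_t dev_type_id = 0;
    buf.read(reinterpret_cast<char*>(&dev_type_id), sizeof(dev_type_id));
    dev_type = static_cast<device_type>(dev_type_id);
    return dev_type == device_type::integrated_gpu ? 0 : 1;
}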

src/plugins/intel_gpu/src/plugin/ops/matmul.cpp

+12 -2

@@ -46,8 +46,18 @@ static void CreateMatMulOp(ProgramBuilder& p, const std::shared_ptr<ov::op::v0::
             return false;
 
         // dynamic shapes and 1D tensors are not transposed
-        if (shapes[0].is_dynamic() || shapes[1].is_dynamic() ||
-            shapes[0].size() < 2 || shapes[1].size() < 2)
+        if (shapes[0].is_dynamic() || shapes[1].is_dynamic()) {
+            // Currently, the optimized cldnn gemm kernel (gemm_tiled_opt) does not support a transposed input whose shape is not aligned to 16.
+            // If the shape is not aligned to 16, gemm_ref_kernel will be selected,
+            // but its performance is worse than permute + gemm_tiled_opt.
+            // So we use the permute + gemm_tiled_opt strategy as a temporary solution,
+            // until the proper fix is available, i.e., gemm_tiled_opt supporting unaligned transposed shapes.
+            if (p.get_engine().get_device_info().dev_type == cldnn::device_type::integrated_gpu)
+                return true;
+            else
+                return false;
+        }
+        if (shapes[0].size() < 2 || shapes[1].size() < 2)
             return false;
 
         // don't transpose inputs if they're aligned to 16
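So for dynamic shapes the transpose decision now depends on the device: iGPU opts into an explicit permute (which then feeds gemm_tiled_opt), while dGPU keeps the old behavior of not transposing. A condensed, hypothetical restatement of the branch follows (single-rank form; should_transpose and its parameters are illustrative names, not the plugin's API):

#include <cstddef>

enum class device_type { integrated_gpu, discrete_gpu };  // stand-in for cldnn::device_type

// Hypothetical condensation of the transpose decision in CreateMatMulOp.
// Returning true means: insert an explicit permute in front of the gemm.
bool should_transpose(bool any_input_dynamic, size_t min_rank, device_type dev) {
    if (any_input_dynamic) {
        // iGPU: permute + gemm_tiled_opt beats falling back to gemm_ref
        // on transposed inputs whose shapes are not aligned to 16.
        return dev == device_type::integrated_gpu;
    }
    if (min_rank < 2)  // 1D tensors are never transposed
        return false;
    return true;  // placeholder for the remaining static-shape checks
                  // (e.g. inputs already aligned to 16)
}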

src/plugins/intel_gpu/tests/unit/fake_alignment/fc_fake_alignment_test.cpp

+29 -12

@@ -24,8 +24,11 @@ struct fc_fake_align_params {
     layout input_layout;
     layout weight_layout;
     data_types data_type;
-    layout expected_input_layout;
-    layout expected_output_layout;
+    layout expected_input_layout_igpu;
+    layout expected_output_layout_igpu;
+    layout expected_input_layout_dgpu;
+    layout expected_output_layout_dgpu;
+
 };
 
 class fully_connected_fake_align_test : public testing::TestWithParam<fc_fake_align_params> {};
@@ -54,8 +57,13 @@ TEST_P(fully_connected_fake_align_test, fake_alignment) {
         EXPECT_THROW(fully_connected_inst::get_fake_aligned_params(*impl_param), std::exception);
     } else {
         auto updated_param = fully_connected_inst::get_fake_aligned_params(*impl_param);
-        ASSERT_EQ(updated_param.get_input_layout(), p.expected_input_layout);
-        ASSERT_EQ(updated_param.get_output_layout(), p.expected_output_layout);
+        if (!engine.get_device_info().supports_immad) {
+            ASSERT_EQ(updated_param.get_input_layout(), p.expected_input_layout_igpu);
+            ASSERT_EQ(updated_param.get_output_layout(), p.expected_output_layout_igpu);
+        } else {
+            ASSERT_EQ(updated_param.get_input_layout(), p.expected_input_layout_dgpu);
+            ASSERT_EQ(updated_param.get_output_layout(), p.expected_output_layout_dgpu);
+        }
     }
 }
 
@@ -65,29 +73,38 @@ INSTANTIATE_TEST_SUITE_P(smoke, fully_connected_fake_align_test,
            layout{ov::PartialShape{0, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}},  // input_layout
            layout{ov::PartialShape{1000, 1024}, data_types::i8, format::bfyx},  // weight layout
            data_types::f16,
-            layout{ov::PartialShape{0, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}},  // fake_aligned input layout
-            layout{ov::PartialShape{0, 1000}, data_types::f16, format::bfyx}  // fake_aligned output layout
+            layout{ov::PartialShape{0, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}},  // fake_aligned input layout_igpu
+            layout{ov::PartialShape{0, 1000}, data_types::f16, format::bfyx},  // fake_aligned output layout_igpu
+            layout{ov::PartialShape{0, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}},  // fake_aligned input layout_dgpu
+            layout{ov::PartialShape{0, 1000}, data_types::f16, format::bfyx}  // fake_aligned output layout_dgpu
        },
        {
            layout{ov::PartialShape{11, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}},  // input_layout
            layout{ov::PartialShape{1000, 1024}, data_types::i8, format::bfyx},  // weight layout
            data_types::f16,
-            layout{ov::PartialShape{16, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}},  // fake_aligned input layout
-            layout{ov::PartialShape{16, 1000}, data_types::f16, format::bfyx}  // fake_aligned output layout
+            layout{ov::PartialShape{16, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}},  // fake_aligned input layout_igpu
+            layout{ov::PartialShape{16, 1000}, data_types::f16, format::bfyx},  // fake_aligned output layout_igpu
+            layout{ov::PartialShape{16, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}},  // fake_aligned input layout_dgpu
+            layout{ov::PartialShape{16, 1000}, data_types::f16, format::bfyx}  // fake_aligned output layout_dgpu
+
        },
        {
            layout{ov::PartialShape{133, 511}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}},  // input_layout
            layout{ov::PartialShape{800, 511}, data_types::i8, format::bfyx},  // weight layout
            data_types::f16,
-            layout{ov::PartialShape{136, 511}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}},  // fake_aligned input layout
-            layout{ov::PartialShape{136, 800}, data_types::f16, format::bfyx}  // fake_aligned output layout
+            layout{ov::PartialShape{144, 511}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}},  // fake_aligned input layout_igpu
+            layout{ov::PartialShape{144, 800}, data_types::f16, format::bfyx},  // fake_aligned output layout_igpu
+            layout{ov::PartialShape{136, 511}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}},  // fake_aligned input layout_dgpu
+            layout{ov::PartialShape{136, 800}, data_types::f16, format::bfyx}  // fake_aligned output layout_dgpu
        },
        {
            layout{ov::PartialShape::dynamic(2), data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}},  // input_layout
            layout{ov::PartialShape{1000, 1024}, data_types::i8, format::bfyx},  // weight layout
            data_types::f16,
-            layout{ov::PartialShape{-1, -1}, data_types::i8, format::bfyx},  // fake_aligned input layout // dummy
-            layout{ov::PartialShape{-1, -1}, data_types::f16, format::bfyx}  // fake_aligned output layout // dummy
+            layout{ov::PartialShape{-1, -1}, data_types::i8, format::bfyx},  // fake_aligned input layout_igpu // dummy
+            layout{ov::PartialShape{-1, -1}, data_types::f16, format::bfyx},  // fake_aligned output layout_igpu // dummy
+            layout{ov::PartialShape{-1, -1}, data_types::i8, format::bfyx},  // fake_aligned input layout_dgpu // dummy
+            layout{ov::PartialShape{-1, -1}, data_types::f16, format::bfyx}  // fake_aligned output layout_dgpu // dummy
        },
 
    }));
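The third case above is the one that actually exercises the new base: 133 is not a multiple of either alignment, and align_to(133, 16) = 144 while align_to(133, 8) = 136, which is exactly the split between the _igpu and _dgpu expectations.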

src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp

+15 -10

@@ -1874,6 +1874,7 @@ TEST(fully_connected_onednn, impl_replacement_with_cldnn) {
 
     const int32_t input_f = 3, input_b = 1, weight_b = 4;
 
+    auto fake_alignment_size = engine.get_device_info().supports_immad ? 8 : 16;
     auto input_dyn_layout = layout{ ov::PartialShape{ ov::Dimension(1, 10), input_f }, data_types::f32,format::bfyx };
     auto input_data = engine.allocate_memory(layout{ ov::PartialShape{ input_b, input_f }, data_types::f32,format::bfyx });
     auto weights_data = engine.allocate_memory({ ov::PartialShape{ weight_b, input_f }, data_types::f32,format::bfyx });
@@ -1909,7 +1910,7 @@ TEST(fully_connected_onednn, impl_replacement_with_cldnn) {
     auto output_prim_mem = outputs.begin()->second.get_memory();
 
     auto out_l = network.get_output_layout(outputs.begin()->first);
-    ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(input_b, 8)); // fake_alignment
+    ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(input_b, fake_alignment_size)); // fake_alignment
     ASSERT_EQ(out_l.batch(), input_b);
     ASSERT_EQ(out_l.feature(), weight_b);
     ASSERT_EQ(out_l.spatial(0), 1);
@@ -2045,6 +2046,7 @@ TEST(fully_connected_gpu, dynamic) {
 
     const int32_t input_f = 3, input_b = 1, weight_b = 4;
 
+    auto fake_alignment_size = engine.get_device_info().supports_immad ? 8 : 16;
     auto input_dyn_layout = layout{ ov::PartialShape{ ov::Dimension(1, 10), input_f }, data_types::f32,format::bfyx };
     auto input_data = engine.allocate_memory(layout{ ov::PartialShape{ input_b, input_f }, data_types::f32,format::bfyx });
     auto weights_data = engine.allocate_memory({ ov::PartialShape{ weight_b, input_f }, data_types::f32,format::bfyx });
@@ -2071,7 +2073,7 @@ TEST(fully_connected_gpu, dynamic) {
     auto output_prim_mem = outputs.begin()->second.get_memory();
 
     auto out_l = network.get_output_layout(outputs.begin()->first);
-    ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(input_b, 8)); // fake_alignment
+    ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(input_b, fake_alignment_size)); // fake_alignment
     ASSERT_EQ(out_l.batch(), input_b);
     ASSERT_EQ(out_l.feature(), weight_b);
     ASSERT_EQ(out_l.spatial(0), 1);
@@ -2199,7 +2201,7 @@ TEST(fully_connected_gpu, dynamic_multi_inference_same_shape) {
     auto input_data1 = engine.allocate_memory(input_actual_layout);
     auto input_data2 = engine.allocate_memory(input_actual_layout);
     auto weights_data = engine.allocate_memory({ ov::PartialShape{ weight_b, input_f }, data_types::f32,format::bfyx });
-
+    auto fake_alignment_size = engine.get_device_info().supports_immad ? 8 : 16;
     set_values(input_data1, { 0.5f, -2.0f, -0.5f });
     set_values(input_data2, { -0.5f, 2.0f, 0.5f });
     set_values(weights_data, { 1.5f, 1.0f, 0.5f,
@@ -2228,7 +2230,7 @@ TEST(fully_connected_gpu, dynamic_multi_inference_same_shape) {
     auto output_prim_mem = outputs.begin()->second.get_memory();
 
     auto out_l = network.get_output_layout(outputs.begin()->first);
-    ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(input_b, 8)); // fake_alignment
+    ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(input_b, fake_alignment_size)); // fake_alignment
     ASSERT_EQ(out_l.batch(), input_b);
     ASSERT_EQ(out_l.feature(), weight_b);
     ASSERT_EQ(out_l.spatial(0), 1);
@@ -2252,7 +2254,7 @@ TEST(fully_connected_gpu, dynamic_multi_inference_same_shape) {
     auto output_prim_mem = outputs.begin()->second.get_memory();
 
     auto out_l = network.get_output_layout(outputs.begin()->first);
-    ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(input_b, 8)); // fake_alignment
+    ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(input_b, fake_alignment_size)); // fake_alignment
     ASSERT_EQ(out_l.batch(), input_b);
     ASSERT_EQ(out_l.feature(), weight_b);
     ASSERT_EQ(out_l.spatial(0), 1);
@@ -2272,6 +2274,7 @@ TEST(fully_connected_gpu, dynamic_multi_inference_different_shape) {
 
     const int32_t input_f = 3, weight_b = 4;
 
+    auto fake_alignment_size = engine.get_device_info().supports_immad ? 8 : 16;
     auto input_dyn_layout = layout{ ov::PartialShape{ ov::Dimension(1, 10), input_f }, data_types::f32,format::bfyx };
     auto input_actual_layout1 = layout{ ov::PartialShape{ 2, input_f }, data_types::f32,format::bfyx};
     auto input_actual_layout2 = layout{ ov::PartialShape{ 1, input_f }, data_types::f32,format::bfyx};
@@ -2311,7 +2314,7 @@ TEST(fully_connected_gpu, dynamic_multi_inference_different_shape) {
     auto output_prim_mem = outputs.begin()->second.get_memory();
 
     auto out_l = network.get_output_layout(outputs.begin()->first);
-    ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(2, 8)); // fake_alignment
+    ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(2, fake_alignment_size)); // fake_alignment
     ASSERT_EQ(out_l.batch(), 2);
     ASSERT_EQ(out_l.feature(), weight_b);
     ASSERT_EQ(out_l.spatial(0), 1);
@@ -2340,7 +2343,7 @@ TEST(fully_connected_gpu, dynamic_multi_inference_different_shape) {
     auto output_prim_mem = outputs.begin()->second.get_memory();
 
     auto out_l = network.get_output_layout(outputs.begin()->first);
-    ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(1, 8)); // fake_alignment
+    ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(1, fake_alignment_size)); // fake_alignment
     ASSERT_EQ(out_l.batch(), 1);
     ASSERT_EQ(out_l.feature(), weight_b);
     ASSERT_EQ(out_l.spatial(0), 1);
@@ -2360,6 +2363,7 @@ TEST(fully_connected_gpu, dynamic_multi_inference_multiple_shapes) {
 
     const int32_t input_f = 3, weight_b = 4;
 
+    auto fake_alignment_size = engine.get_device_info().supports_immad ? 8 : 16;
     auto input_dyn_layout = layout{ ov::PartialShape{ ov::Dimension(1, 10), input_f }, data_types::f32,format::bfyx };
     auto input_actual_layout1 = layout{ ov::PartialShape{ 2, input_f }, data_types::f32,format::bfyx};
     auto input_actual_layout2 = layout{ ov::PartialShape{ 1, input_f }, data_types::f32,format::bfyx};
@@ -2398,7 +2402,7 @@ TEST(fully_connected_gpu, dynamic_multi_inference_multiple_shapes) {
     auto output_prim_mem = outputs.begin()->second.get_memory();
 
     auto out_l = network.get_output_layout(outputs.begin()->first);
-    ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(2, 8)); // fake_alignment
+    ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(2, fake_alignment_size)); // fake_alignment
     ASSERT_EQ(out_l.batch(), 2); // fake_alignment
     ASSERT_EQ(out_l.feature(), weight_b);
     ASSERT_EQ(out_l.spatial(0), 1);
@@ -2427,7 +2431,7 @@ TEST(fully_connected_gpu, dynamic_multi_inference_multiple_shapes) {
     auto output_prim_mem = outputs.begin()->second.get_memory();
 
     auto out_l = network.get_output_layout(outputs.begin()->first);
-    ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(1, 8)); // fake_alignment
+    ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(1, fake_alignment_size)); // fake_alignment
     ASSERT_EQ(out_l.batch(), 1); // fake_alignment
     ASSERT_EQ(out_l.feature(), weight_b);
     ASSERT_EQ(out_l.spatial(0), 1);
@@ -2661,6 +2665,7 @@ TEST(fully_connected_gpu, has_cached_weights_reorder) {
 
     const int32_t input_f = 3, input_b = 1, weight_b = 4;
 
+    auto fake_alignment_size = engine.get_device_info().supports_immad ? 8 : 16;
     auto input_dyn_layout = layout{ ov::PartialShape{ ov::Dimension(1, 10), input_f }, data_types::f32,format::bfyx };
     auto input_data = engine.allocate_memory(layout{ ov::PartialShape{ input_b, input_f }, data_types::f32,format::bfyx });
     auto weights_data = engine.allocate_memory({ ov::PartialShape{ weight_b, input_f }, data_types::f32,format::bfyx });
@@ -2701,7 +2706,7 @@ TEST(fully_connected_gpu, has_cached_weights_reorder) {
     ASSERT_TRUE(reorder_impl == nullptr);
 
     auto out_l = network.get_output_layout(outputs.begin()->first);
-    ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(input_b, 8)); // fake_alignment
+    ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(input_b, fake_alignment_size)); // fake_alignment
     ASSERT_EQ(out_l.batch(), input_b);
     ASSERT_EQ(out_l.feature(), weight_b);
     ASSERT_EQ(out_l.spatial(0), 1);
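These tests key the expected padding off engine.get_device_info().supports_immad rather than dev_type, treating immad-capable (oneDNN) devices as the 8-aligned path and everything else as the 16-aligned iGPU path. The repeated pattern could be captured in a small helper; a sketch, where expected_fake_aligned_batch is a hypothetical name and align_to plus the engine types are assumed from the test file's existing includes:

// Hypothetical helper mirroring the assertion pattern repeated across these tests.
size_t expected_fake_aligned_batch(const cldnn::engine& engine, size_t batch) {
    auto fake_alignment_size = engine.get_device_info().supports_immad ? 8 : 16;
    return align_to(batch, fake_alignment_size);
}
// Usage: ASSERT_EQ(output_prim_mem->get_layout().batch(),
//                  expected_fake_aligned_batch(engine, input_b));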
