Skip to content

Commit 8da6404

Browse files
author
Roman Dubtsov
committed Jan 9, 2020
style: update _clang-format
1 parent c07c09e commit 8da6404

3 files changed

+95
-80
lines changed
 

‎_clang-format

+2
Original file line number | Diff line number | Diff line change
@@ -114,6 +114,8 @@ SpacesInSquareBrackets: false
114114
Standard: Cpp11
115115
StatementMacros:
116116
- for_
117+
- PRAGMA_OMP
118+
- PRAGMA_OMP_SIMD
117119
TabWidth: 4
118120
UseTab: Never
119121
...

‎src/cpu/jit_avx512_common_convolution_winograd.cpp

+20 -19
Original file line number | Diff line number | Diff line change
@@ -1081,18 +1081,19 @@ void jit_avx512_common_convolution_winograd_bwd_weights_t::
10811081
array_offset_calculator<float, 2> diff_bias_prv(
10821082
scratchpad.get<float>(key_conv_bia_reduction), nthreads, jcp.oc);
10831083

1084-
PRAGMA_OMP(parallel num_threads(nthreads)) {
1084+
PRAGMA_OMP(parallel num_threads(nthreads))
1085+
{
10851086
if (jcp.with_bias) {
10861087
parallel_nd_in_omp(nthreads, jcp.oc, [&](int ithr, int ofm) {
10871088
diff_bias_prv(ithr, ofm) = 0.0f;
10881089
});
10891090

1090-
PRAGMA_OMP(for nowait)
1091-
for (int bofm = 0; bofm < jcp.oc / simd_w; bofm++) {
1092-
PRAGMA_OMP_SIMD()
1093-
for (int v = 0; v < simd_w; v++)
1094-
diff_bias(bofm, v) = 0.0f;
1095-
}
1091+
PRAGMA_OMP(for nowait)
1092+
for (int bofm = 0; bofm < jcp.oc / simd_w; bofm++) {
1093+
PRAGMA_OMP_SIMD()
1094+
for (int v = 0; v < simd_w; v++)
1095+
diff_bias(bofm, v) = 0.0f;
1096+
}
10961097
}
10971098

10981099
const int ithread = dnnl_get_thread_num();
@@ -1154,18 +1155,18 @@ for (int bofm = 0; bofm < jcp.oc / simd_w; bofm++) {
11541155
});
11551156

11561157
if (jcp.with_bias) {
1157-
PRAGMA_OMP(for)
1158-
for (int ofm1 = 0; ofm1 < jcp.oc / simd_w; ofm1++) {
1159-
for (int ithr = 0; ithr < nthreads; ithr++) {
1160-
float *base_bias_ptr = &(diff_bias(ofm1, 0));
1161-
float *base_bias_prv_ptr
1162-
= &(diff_bias_prv(ithr * jcp.oc + ofm1 * simd_w));
1163-
PRAGMA_OMP_SIMD()
1164-
for (int ofm2 = 0; ofm2 < simd_w; ofm2++) {
1165-
base_bias_ptr[ofm2] += base_bias_prv_ptr[ofm2];
1166-
}
1167-
}
1168-
}
1158+
PRAGMA_OMP(for)
1159+
for (int ofm1 = 0; ofm1 < jcp.oc / simd_w; ofm1++) {
1160+
for (int ithr = 0; ithr < nthreads; ithr++) {
1161+
float *base_bias_ptr = &(diff_bias(ofm1, 0));
1162+
float *base_bias_prv_ptr
1163+
= &(diff_bias_prv(ithr * jcp.oc + ofm1 * simd_w));
1164+
PRAGMA_OMP_SIMD()
1165+
for (int ofm2 = 0; ofm2 < simd_w; ofm2++) {
1166+
base_bias_ptr[ofm2] += base_bias_prv_ptr[ofm2];
1167+
}
1168+
}
1169+
}
11691170
}
11701171
}
11711172

‎src/cpu/jit_avx512_core_f32_wino_conv_4x3.cpp

+73 -61
Original file line number | Diff line number | Diff line change
@@ -502,7 +502,8 @@ void subarray_sum(size_t num_arrs, float *output, size_t nelems,
502502
const size_t blocks_number = nelems / block_size;
503503
const size_t tail = nelems % block_size;
504504

505-
PRAGMA_OMP(parallel) {
505+
PRAGMA_OMP(parallel)
506+
{
506507
const int ithr = dnnl_get_thread_num();
507508
const int nthr = dnnl_get_num_threads();
508509
size_t start {0}, end {0};
@@ -583,7 +584,8 @@ void array_sum(size_t num_arrs, float *output, size_t nelems,
583584
const size_t blocks_number = nelems / block_size;
584585
const size_t tail = nelems % block_size;
585586

586-
PRAGMA_OMP(parallel) {
587+
PRAGMA_OMP(parallel)
588+
{
587589
const size_t ithr = dnnl_get_thread_num();
588590
const size_t nthr = dnnl_get_num_threads();
589591
size_t start {0}, end {0};
@@ -672,7 +674,8 @@ void jit_avx512_core_f32_wino_conv_4x3_bwd_weights_t::
672674
0.179271708683473f, 0.403361344537815f, 1.13777777777778f};
673675
float G_O_3x3_4x4[4] = {2.25f, 0.625f, 1.5f, 0.390625f};
674676

675-
PRAGMA_OMP(parallel num_threads(nthreads) firstprivate(trans_ker_p, I, T)) {
677+
PRAGMA_OMP(parallel num_threads(nthreads) firstprivate(trans_ker_p, I, T))
678+
{
676679
if (jcp.with_bias) {
677680
parallel_nd_in_omp(
678681
nthreads, jcp.oc / simd_w, [&](int ithr, int ofm) {
@@ -687,69 +690,76 @@ void jit_avx512_core_f32_wino_conv_4x3_bwd_weights_t::
687690
int ithr = dnnl_get_thread_num();
688691
for (int ifm1 = 0; ifm1 < jcp.nb_ic; ++ifm1) {
689692
int first_tblk = 0;
690-
PRAGMA_OMP(for)
691-
for (int tblk1 = 0; tblk1 < jcp.tile_block; ++tblk1) {
692-
int tile_index = tblk1 * jcp.nb_tile_block_ur * jcp.tile_block_ur;
693-
int img = tile_index / (jcp.itiles * jcp.jtiles);
694-
trans_ker_p.ti = tile_index % jcp.itiles;
695-
trans_ker_p.tj = (tile_index / jcp.itiles) % jcp.jtiles;
696-
trans_ker_p.M = I;
697-
trans_ker_p.T = T;
698-
trans_ker_p.G = G_I_3x3_4x4;
699-
for (int ifm2 = 0; ifm2 < jcp.ic_block; ++ifm2) {
700-
int ifm = ifm1 * jcp.ic_block + ifm2;
701-
trans_ker_p.src = (float *)&(src(img, ifm, 0, 0, 0));
702-
trans_ker_p.dst = (float *)&(V(ithr, 0, 0, ifm2, 0, 0, 0));
703-
kernel_->src_transform(&trans_ker_p);
704-
}
705-
706-
for (int ofm1 = 0; ofm1 < jcp.nb_oc; ++ofm1) {
707-
trans_ker_p.G = G_W_3x3_4x4;
708-
for (int ofm2 = 0; ofm2 < jcp.oc_block; ++ofm2) {
709-
int ofm = (ofm1 * jcp.oc_block + ofm2) * jcp.oc_reg_block;
710-
trans_ker_p.src = (float *)&(diff_dst(img, ofm, 0, 0, 0));
711-
trans_ker_p.dst = (float *)&(M(ithr, 0, 0, ofm2, 0, 0, 0, 0));
712-
if (jcp.with_bias && ifm1 == 0) {
713-
trans_ker_p.bias
714-
= (float *)&(diff_bias_prv(ithr, ofm * simd_w));
715-
kernel_->diff_dst_transform_wbias(&trans_ker_p);
716-
} else {
717-
kernel_->diff_dst_transform(&trans_ker_p);
718-
}
719-
}
720-
721-
for (int oj = 0; oj < alpha; ++oj) {
722-
for (int oi = 0; oi < alpha; ++oi) {
723-
kernel_->gemm_loop_ker_first_iter(
724-
&(Us(ithr, oj, oi, 0, 0, 0, 0, 0)),
725-
&(M(ithr, oj, oi, 0, 0, 0, 0, 0)),
726-
&(V(ithr, oj, oi, 0, 0, 0, 0)));
727-
}
728-
}
729-
trans_ker_p.G = G_O_3x3_4x4;
730-
for (int ofm2 = 0; ofm2 < jcp.oc_block; ++ofm2) {
731-
for (int ofm3 = 0; ofm3 < jcp.oc_reg_block; ++ofm3) {
732-
int ofm = (ofm1 * jcp.oc_block + ofm2) * jcp.oc_reg_block
733-
+ ofm3;
693+
PRAGMA_OMP(for)
694+
for (int tblk1 = 0; tblk1 < jcp.tile_block; ++tblk1) {
695+
int tile_index
696+
= tblk1 * jcp.nb_tile_block_ur * jcp.tile_block_ur;
697+
int img = tile_index / (jcp.itiles * jcp.jtiles);
698+
trans_ker_p.ti = tile_index % jcp.itiles;
699+
trans_ker_p.tj = (tile_index / jcp.itiles) % jcp.jtiles;
700+
trans_ker_p.M = I;
701+
trans_ker_p.T = T;
702+
trans_ker_p.G = G_I_3x3_4x4;
734703
for (int ifm2 = 0; ifm2 < jcp.ic_block; ++ifm2) {
735704
int ifm = ifm1 * jcp.ic_block + ifm2;
736-
trans_ker_p.src = (float *)&(
737-
Us(ithr, 0, 0, ofm2, ifm2, 0, ofm3, 0));
738-
trans_ker_p.dst = (float *)&(
739-
diff_weights_prv(ithr, ofm, ifm, 0, 0, 0, 0));
740-
if (first_tblk == 0) {
741-
kernel_->diff_weights_transform(&trans_ker_p);
742-
} else {
743-
kernel_->diff_weights_transform_accum(&trans_ker_p);
705+
trans_ker_p.src = (float *)&(src(img, ifm, 0, 0, 0));
706+
trans_ker_p.dst = (float *)&(V(ithr, 0, 0, ifm2, 0, 0, 0));
707+
kernel_->src_transform(&trans_ker_p);
708+
}
709+
710+
for (int ofm1 = 0; ofm1 < jcp.nb_oc; ++ofm1) {
711+
trans_ker_p.G = G_W_3x3_4x4;
712+
for (int ofm2 = 0; ofm2 < jcp.oc_block; ++ofm2) {
713+
int ofm = (ofm1 * jcp.oc_block + ofm2)
714+
* jcp.oc_reg_block;
715+
trans_ker_p.src
716+
= (float *)&(diff_dst(img, ofm, 0, 0, 0));
717+
trans_ker_p.dst
718+
= (float *)&(M(ithr, 0, 0, ofm2, 0, 0, 0, 0));
719+
if (jcp.with_bias && ifm1 == 0) {
720+
trans_ker_p.bias = (float *)&(
721+
diff_bias_prv(ithr, ofm * simd_w));
722+
kernel_->diff_dst_transform_wbias(&trans_ker_p);
723+
} else {
724+
kernel_->diff_dst_transform(&trans_ker_p);
725+
}
726+
}
727+
728+
for (int oj = 0; oj < alpha; ++oj) {
729+
for (int oi = 0; oi < alpha; ++oi) {
730+
kernel_->gemm_loop_ker_first_iter(
731+
&(Us(ithr, oj, oi, 0, 0, 0, 0, 0)),
732+
&(M(ithr, oj, oi, 0, 0, 0, 0, 0)),
733+
&(V(ithr, oj, oi, 0, 0, 0, 0)));
734+
}
735+
}
736+
trans_ker_p.G = G_O_3x3_4x4;
737+
for (int ofm2 = 0; ofm2 < jcp.oc_block; ++ofm2) {
738+
for (int ofm3 = 0; ofm3 < jcp.oc_reg_block; ++ofm3) {
739+
int ofm = (ofm1 * jcp.oc_block + ofm2)
740+
* jcp.oc_reg_block
741+
+ ofm3;
742+
for (int ifm2 = 0; ifm2 < jcp.ic_block; ++ifm2) {
743+
int ifm = ifm1 * jcp.ic_block + ifm2;
744+
trans_ker_p.src = (float *)&(
745+
Us(ithr, 0, 0, ofm2, ifm2, 0, ofm3, 0));
746+
trans_ker_p.dst = (float *)&(diff_weights_prv(
747+
ithr, ofm, ifm, 0, 0, 0, 0));
748+
if (first_tblk == 0) {
749+
kernel_->diff_weights_transform(
750+
&trans_ker_p);
751+
} else {
752+
kernel_->diff_weights_transform_accum(
753+
&trans_ker_p);
754+
}
755+
}
756+
}
744757
}
745758
}
759+
++first_tblk;
746760
}
747761
}
748762
}
749-
++first_tblk;
750-
}
751-
}
752-
}
753763

754764
// Reduce diff-weights
755765
{
@@ -826,7 +836,8 @@ void jit_avx512_core_f32_wino_conv_4x3_bwd_weights_t::
826836
float I[alpha][alpha][simd_w];
827837
float T[alpha][alpha][simd_w];
828838

829-
PRAGMA_OMP(parallel firstprivate(first_tblk, trans_ker_p, I, T)) {
839+
PRAGMA_OMP(parallel firstprivate(first_tblk, trans_ker_p, I, T))
840+
{
830841
if (jcp.with_bias) {
831842
parallel_nd_in_omp(nthreads, jcp.oc, [&](int ithr, int ofm) {
832843
diff_bias_prv(ithr, ofm) = 0.0f;
@@ -923,7 +934,8 @@ void jit_avx512_core_f32_wino_conv_4x3_bwd_weights_t::
923934
}
924935

925936
trans_ker_p.G = G_O_3x3_4x4;
926-
PRAGMA_OMP(parallel firstprivate(trans_ker_p)) {
937+
PRAGMA_OMP(parallel firstprivate(trans_ker_p))
938+
{
927939
parallel_nd_in_omp(jcp.nb_ic, jcp.nb_oc, jcp.oc_block, jcp.ic_block,
928940
jcp.oc_reg_block,
929941
[&](int ifm1, int ofm1, int ofm2, int ifm2, int ofm3) {

0 commit comments

Comments
 (0)
Please sign in to comment.