From b43fc37491b69e6af68749d74bcf23aa4a92a3b3 Mon Sep 17 00:00:00 2001 From: Dwijen Chawra Date: Tue, 3 Dec 2024 01:48:03 -0500 Subject: [PATCH 1/8] added code to xgboost so we can lower states on the fly --- conv2dout_lowered.txt | 50145 ++++++++++++++++ llpass.py | 162 + lowered_tir.py | 481 + .../auto_scheduler/cost_model/xgb_model.py | 33 + tune_conv2d_layer_cuda.py | 226 + 5 files changed, 51047 insertions(+) create mode 100644 conv2dout_lowered.txt create mode 100644 llpass.py create mode 100644 lowered_tir.py create mode 100644 tune_conv2d_layer_cuda.py diff --git a/conv2dout_lowered.txt b/conv2dout_lowered.txt new file mode 100644 index 000000000000..f2a1c453f8c5 --- /dev/null +++ b/conv2dout_lowered.txt @@ -0,0 +1,50145 @@ +---------------------------------------------------------------------- +------------------------------ [ Search ] +---------------------------------------------------------------------- +Generate Sketches #s: 1 +Sample Initial Population #s: 66 fail_ct: 1982 Time elapsed: 2.01 +GA Iter: 0 Max score: 0.9867 Min score: 0.0250 #Pop: 66 #M+: 0 #M-: 0 +GA Iter: 4 Max score: 0.9997 Min score: 0.9788 #Pop: 128 #M+: 1390 #M-: 0 +EvolutionarySearch #s: 128 Time elapsed: 30.83 +---------------------------------------------------------------------- +------------------------------ [ Measure ] +---------------------------------------------------------------------- +Get 10 programs to measure: +..........**********================================================== +No: 1 GFLOPS: 40.51 / 40.51 results: MeasureResult(cost:[0.0057], error_no:0, all_cost:1.67, Tstamp:1732601215.31) +================================================= +Placeholder: data, kernel, bias +blockIdx.x i0.0@i1.0@i2.0@i3.0@ (0,8) + vthread i0.1@i1.1@i2.1@i3.1@ (0,2) + threadIdx.x i0.2@i1.2@i2.2@i3.2@ (0,16) + for rc.0 (0,32) + for ry.0 (0,3) + for ax0@ax1@ax2@ax3@.0.0 (0,192) + threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,16) + kernel.shared = ... 
+ for ax0@ax1@ax2@ax3@.0.0 (0,3) + threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,16) + vectorize ax0@ax1@ax2@ax3@.1 (0,24) + pad_temp.shared = ... + for rc.1 (0,16) + for yy.3 (0,7) + for xx.3 (0,7) + for rx.2 (0,3) + for ff.4 (0,2) + conv2d_nchw = ... + for i1.3 (0,2) + for i2.3 (0,7) + for i3.3 (0,7) + compute = ... + +================================================== +No: 2 GFLOPS: 356.02 / 356.02 results: MeasureResult(cost:[0.0006], error_no:0, all_cost:0.95, Tstamp:1732601215.87) +================================================== +Placeholder: data, kernel, bias +blockIdx.x i0.0@i1.0@i2.0@i3.0@ (0,4) + vthread i0.1@i1.1@i2.1@i3.1@ (0,7) + threadIdx.x i0.2@i1.2@i2.2@i3.2@ (0,128) + for rc.0 (0,512) + for ax0@ax1@ax2@ax3@.0.0 (0,3) + threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,128) + vectorize ax0@ax1@ax2@ax3@.1 (0,3) + kernel.shared = ... + threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,128) + pad_temp.shared = ... + for ry.2 (0,3) + for rx.2 (0,3) + for xx.4 (0,7) + conv2d_nchw = ... + for i3.3 (0,7) + compute = ... + +================================================== +No: 3 GFLOPS: 505.45 / 505.45 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:11.00, Tstamp:1732601226.47) +================================================== +Placeholder: data, kernel, bias +blockIdx.x i0.0@i1.0@i2.0@i3.0@ (0,16) + threadIdx.x i0.2@i1.2@i2.2@i3.2@ (0,392) + conv2d_nchw auto_unroll: 16 + for rc.0 (0,16) + for ax0@ax1@ax2@ax3@.0.0 (0,24) + threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,392) + kernel.shared = ... + for ax0@ax1@ax2@ax3@.0.0 (0,7) + threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,392) + pad_temp.shared = ... + for rc.1 (0,32) + for rx.1 (0,3) + for ff.3 (0,4) + for ry.2 (0,3) + conv2d_nchw = ... + for i1.3 (0,4) + compute = ... 
+ +================================================== +No: 4 GFLOPS: 218.94 / 505.45 results: MeasureResult(cost:[0.0011], error_no:0, all_cost:1.55, Tstamp:1732601227.51) +================================================== +Placeholder: data, kernel, bias +blockIdx.x i0.0@i1.0@i2.0@i3.0@ (0,16) + vthread i0.1@i1.1@i2.1@i3.1@ (0,7) + threadIdx.x i0.2@i1.2@i2.2@i3.2@ (0,56) + conv2d_nchw auto_unroll: 64 + for rc.0 (0,32) + for ax0@ax1@ax2@ax3@.0.0 (0,83) + threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,56) + kernel.shared = ... + for ax0@ax1@ax2@ax3@.0.0 (0,24) + threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,56) + pad_temp.shared = ... + for rx.1 (0,3) + for ff.3 (0,4) + for rc.2 (0,16) + for ry.2 (0,3) + conv2d_nchw = ... + for i1.3 (0,4) + compute = ... + +================================================== +No: 5 GFLOPS: 259.70 / 505.45 results: MeasureResult(cost:[0.0009], error_no:0, all_cost:0.95, Tstamp:1732601228.07) +================================================== +Placeholder: data, kernel, bias +blockIdx.x i0.0@i1.0@i2.0@i3.0@ (0,8) + vthread i0.1@i1.1@i2.1@i3.1@ (0,7) + threadIdx.x i0.2@i1.2@i2.2@i3.2@ (0,112) + for rc.0 (0,32) + for ry.0 (0,3) + for ax0@ax1@ax2@ax3@.0.0 (0,28) + threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,112) + kernel.shared = ... + for ax0@ax1@ax2@ax3@.0.0 (0,9) + threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,112) + pad_temp.shared = ... + for rc.1 (0,16) + for rx.2 (0,3) + for ff.4 (0,4) + conv2d_nchw = ... + for i1.3 (0,4) + compute = ... + +================================================== +No: 6 GFLOPS: 349.91 / 505.45 results: MeasureResult(cost:[0.0007], error_no:0, all_cost:0.93, Tstamp:1732601228.72) +================================================== +Placeholder: data, kernel, bias +blockIdx.x i0.0@i1.0@i2.0@i3.0@ (0,8) + threadIdx.x i0.2@i1.2@i2.2@i3.2@ (0,448) + conv2d_nchw auto_unroll: 64 + for rc.0 (0,512) + threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,448) + vectorize ax0@ax1@ax2@ax3@.1 (0,3) + kernel.shared = ... 
+ threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,448) + pad_temp.shared = ... + for ry.2 (0,3) + for rx.2 (0,3) + for yy.4 (0,7) + conv2d_nchw = ... + for i2.3 (0,7) + compute = ... + +================================================== +No: 7 GFLOPS: 189.28 / 505.45 results: MeasureResult(cost:[0.0012], error_no:0, all_cost:11.13, Tstamp:1732601239.33) +================================================== +Placeholder: data, kernel, bias +blockIdx.x i0.0@i1.0@i2.0@i3.0@ (0,4) + vthread i0.1@i1.1@i2.1@i3.1@ (0,7) + threadIdx.x i0.2@i1.2@i2.2@i3.2@ (0,64) + conv2d_nchw auto_unroll: 64 + for rc.0 (0,64) + for ax0@ax1@ax2@ax3@.0.0 (0,144) + threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,64) + kernel.shared = ... + for ax0@ax1@ax2@ax3@.0.0 (0,11) + threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,64) + pad_temp.shared = ... + for rc.1 (0,8) + for ff.3 (0,2) + for ry.2 (0,3) + for rx.2 (0,3) + for yy.4 (0,7) + conv2d_nchw = ... + for i1.3 (0,2) + for i2.3 (0,7) + compute = ... + +================================================== +No: 8 GFLOPS: 211.81 / 505.45 results: MeasureResult(cost:[0.0011], error_no:0, all_cost:10.46, Tstamp:1732601249.41) +================================================== +Placeholder: data, kernel, bias +blockIdx.x i0.0@i1.0@i2.0@i3.0@ (0,14) + threadIdx.x i0.2@i1.2@i2.2@i3.2@ (0,128) + for rc.0 (0,256) + for ry.0 (0,3) + for rx.0 (0,3) + for ax0@ax1@ax2@ax3@.0.0 (0,4) + threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,128) + kernel.shared = ... + threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,128) + pad_temp.shared = ... + for rc.1 (0,2) + for ff.4 (0,2) + for yy.4 (0,7) + conv2d_nchw = ... + for i1.3 (0,2) + for i2.3 (0,7) + compute = ... 
+ +================================================== +No: 9 GFLOPS: 106.17 / 505.45 results: MeasureResult(cost:[0.0022], error_no:0, all_cost:0.95, Tstamp:1732601249.98) +================================================== +Placeholder: data, kernel, bias +blockIdx.x i0.0@i1.0@i2.0@i3.0@ (0,28) + vthread i0.1@i1.1@i2.1@i3.1@ (0,4) + threadIdx.x i0.2@i1.2@i2.2@i3.2@ (0,56) + conv2d_nchw auto_unroll: 16 + for rc.0 (0,256) + for ax0@ax1@ax2@ax3@.0.0 (0,42) + threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,56) + kernel.shared = ... + threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,56) + pad_temp.shared = ... + for ry.1 (0,3) + for rx.1 (0,3) + for ff.3 (0,2) + for rc.2 (0,2) + for ff.4 (0,2) + conv2d_nchw = ... + for i1.3 (0,4) + compute = ... + +================================================== +No: 10 GFLOPS: 315.68 / 505.45 results: MeasureResult(cost:[0.0007], error_no:0, all_cost:1.12, Tstamp:1732601250.54) +================================================== +Placeholder: data, kernel, bias +blockIdx.x i0.0@i1.0@i2.0@i3.0@ (0,14) + threadIdx.x i0.2@i1.2@i2.2@i3.2@ (0,128) + conv2d_nchw auto_unroll: 1024 + for rc.0 (0,128) + for ry.0 (0,3) + for rx.0 (0,3) + threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,128) + vectorize ax0@ax1@ax2@ax3@.1 (0,9) + kernel.shared = ... + threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,128) + pad_temp.shared = ... + for rc.2 (0,4) + for ff.4 (0,2) + for yy.4 (0,7) + conv2d_nchw = ... + for i1.3 (0,2) + for i2.3 (0,7) + compute = ... 
+ +Time elapsed for measurement: 38.33 s +---------------------------------------------------------------------- +------------------------------ [ Done ] +---------------------------------------------------------------------- +Computational DAG: +data = PLACEHOLDER [1, 512, 7, 7] +pad_temp(i0, i1, i2, i3) = tir.if_then_else(((((i2 >= 1) && (i2 < 8)) && (i3 >= 1)) && (i3 < 8)), data[i0, i1, (i2 - 1), (i3 - 1)], 0f) +kernel = PLACEHOLDER [512, 512, 3, 3] +conv2d_nchw(nn, ff, yy, xx) += (pad_temp[nn, rc, (yy + ry), (xx + rx)]*kernel[ff, rc, ry, rx]) +bias = PLACEHOLDER [1, 512, 1, 1] +T_add(ax0, ax1, ax2, ax3) = (conv2d_nchw[ax0, ax1, ax2, ax3] + bias[ax0, ax1, 0, 0]) +compute(i0, i1, i2, i3) = max(T_add[i0, i1, i2, i3], 0f) + +Get devices for measurement successfully! + +Phase 0 +-------------------- +0 + +1 + +0 + +512 + +0 + +7 + +0 + +7 + +T.bool(True) + +64 + +1 + +49 + +0 + +1 + +blockIdx_x + +8 + +blockIdx_x * 8 + +8 + +threadIdx_x + +7 + +threadIdx_x // 7 + +1 + +7 + +threadIdx_x % 7 + +1 + +T.bool(True) + +1024 + +1 + +0 + +2 + +0 + +4 + +T.float32(0.0) + +0 + +ff_inner_init + +ff_outer_inner_init + +ff_outer_inner_init * 4 + +ff_inner_init + ff_outer_inner_init * 4 + +ff_inner_init + ff_outer_inner_init * 4 + blockIdx_x * 8 + +conv2d_nchw = T.Buffer((1, 512, 7, 7)) +ff_inner_init = T.int32() +ff_outer_inner_init = T.int32() +blockIdx_x = T.int32() +threadIdx_x = T.int32() +conv2d_nchw[0, ff_inner_init + ff_outer_inner_init * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = T.float32(0.0) + +for ff_inner_init in range(4): + conv2d_nchw = T.Buffer((1, 512, 7, 7)) + ff_outer_inner_init = T.int32() + blockIdx_x = T.int32() + threadIdx_x = T.int32() + conv2d_nchw[0, ff_inner_init + ff_outer_inner_init * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = T.float32(0.0) + +for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): + conv2d_nchw = T.Buffer((1, 512, 7, 7)) + blockIdx_x = T.int32() + threadIdx_x = T.int32() + conv2d_nchw[0, ff_inner_init + 
ff_outer_inner_init * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = T.float32(0.0) + +0 + +8 + +0 + +3 + +0 + +1 + +rc_outer_outer + +64 + +rc_outer_outer * 64 + +64 + +0 + +9 + +rx_outer_outer + +7 + +T.bool(True) + +0 + +83 + +49 + +threadIdx_x + +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer + +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + +threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + +(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 + +(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 + +(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 + +(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1 + +T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1) + +(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + +(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 + +(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 + +(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64 + +T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64) + +(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + +(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer + +(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer + +(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7 + +T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7) + +64 + +(threadIdx_x + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64 + +T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64) + +576 + +(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576 + +T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576) + +4032 + +threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032 + +T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032) + +4032 + +threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032 + +T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032) + +(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 + +1 + +(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 + +8 + +(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 + +(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 + +1 + +(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 + +(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 + +8 + +(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8 + +(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8 + +1 + +(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 
49) // 7 % 9 - 1 + +1 + +(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1 + +data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1] + +T.float32(0.0) + +T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) + +pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") +threadIdx_x = T.int32() +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((1, 512, 7, 7)) +rc_outer_outer = T.int32() +pad_temp_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) + +threadIdx_x = T.int32() +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() +if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((1, 512, 7, 7)) + rc_outer_outer = T.int32() + pad_temp_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, 
(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) + +threadIdx_x = T.int32() +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() +if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((1, 512, 7, 7)) + rc_outer_outer = T.int32() + pad_temp_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) + +threadIdx_x = T.int32() +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() +if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer 
* 49 < 4032): + pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((1, 512, 7, 7)) + rc_outer_outer = T.int32() + pad_temp_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) + +threadIdx_x = T.int32() +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() +if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((1, 512, 7, 7)) + rc_outer_outer = T.int32() + pad_temp_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 
9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) + +threadIdx_x = T.int32() +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() +rx_outer_outer = T.int32() +if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") + data = T.Buffer((1, 512, 7, 7)) + rc_outer_outer = T.int32() + pad_temp_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) + +threadIdx_x = T.int32() +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() +rc_outer_outer = T.int32() +if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): + rx_outer_outer = T.int32() + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") + data = T.Buffer((1, 512, 7, 7)) + pad_temp_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, 
(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) + +threadIdx_x = T.int32() +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() +if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1): + rc_outer_outer = T.int32() + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): + rx_outer_outer = T.int32() + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") + data = T.Buffer((1, 512, 7, 7)) + pad_temp_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 
// 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1): + rc_outer_outer = T.int32() + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): + rx_outer_outer = T.int32() + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + pad_temp_shared = T.Buffer((1, 
512, 9, 9), scope="shared") + data = T.Buffer((1, 512, 7, 7)) + pad_temp_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) + +for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1): + rc_outer_outer = T.int32() + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): + rx_outer_outer = T.int32() + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): + if T.likely(threadIdx_x + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") + data = T.Buffer((1, 512, 7, 7)) + pad_temp_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) + +8 + +blockIdx_x * 8 + +8 + +64 + +rc_outer_outer * 64 + +64 + +0 + +3 + +1 + +T.bool(True) + +0 + +32 + +49 + +threadIdx_x + +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer + +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + +threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + +(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 + +(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + +(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 + +(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) 
// 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 + +(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8 + +T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8) + +(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + +(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 + +(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 + +(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64 + +T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64) + +rx_outer_outer - rx_outer_outer + +rx_outer_outer - rx_outer_outer < 1 + +T.likely(rx_outer_outer - rx_outer_outer < 1) + +512 + +(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512 + +T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512) + +1536 + +threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536 + +T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536) + +1536 + +threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536 + +T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536) + +1536 + +threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536 + +T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536) + +512 + +(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512 + +T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512) + +(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3 + +kernel[(threadIdx_x + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] + +kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") +kernel = T.Buffer((512, 512, 3, 3)) +threadIdx_x = T.int32() +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() +blockIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] + +threadIdx_x = T.int32() +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() +blockIdx_x = T.int32() +if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): + kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") + kernel = T.Buffer((512, 512, 3, 3)) + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + 
rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] + +threadIdx_x = T.int32() +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() +if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + blockIdx_x = T.int32() + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): + kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") + kernel = T.Buffer((512, 512, 3, 3)) + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] + +threadIdx_x = T.int32() +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() +if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + blockIdx_x = T.int32() + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): + kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") + kernel = T.Buffer((512, 512, 3, 3)) + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] + +threadIdx_x = T.int32() +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() +if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + blockIdx_x = T.int32() + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): + kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") + kernel = T.Buffer((512, 512, 3, 3)) + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] + +threadIdx_x = T.int32() +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() +if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + blockIdx_x = T.int32() + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): + kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") + kernel = T.Buffer((512, 512, 3, 3)) + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] + +rx_outer_outer = T.int32() +if T.likely(rx_outer_outer - rx_outer_outer < 1): + threadIdx_x = T.int32() + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + blockIdx_x = T.int32() + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): + kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") + kernel = T.Buffer((512, 512, 3, 3)) + rc_outer_outer = T.int32() + kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] + +threadIdx_x = T.int32() +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() +rc_outer_outer = T.int32() +if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): + rx_outer_outer = T.int32() + if T.likely(rx_outer_outer - rx_outer_outer < 1): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + blockIdx_x = T.int32() + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): + kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") + kernel = T.Buffer((512, 512, 3, 3)) + kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] + +threadIdx_x = T.int32() +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() +blockIdx_x = 
T.int32() +if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): + rc_outer_outer = T.int32() + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): + rx_outer_outer = T.int32() + if T.likely(rx_outer_outer - rx_outer_outer < 1): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): + kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") + kernel = T.Buffer((512, 512, 3, 3)) + kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() + blockIdx_x = T.int32() + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): + rc_outer_outer = T.int32() + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 
< 64): + rx_outer_outer = T.int32() + if T.likely(rx_outer_outer - rx_outer_outer < 1): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): + kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") + kernel = T.Buffer((512, 512, 3, 3)) + kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] + +for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + blockIdx_x = T.int32() + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): + rc_outer_outer = T.int32() + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): + rx_outer_outer = T.int32() + if T.likely(rx_outer_outer - rx_outer_outer < 1): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): + kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") + kernel = T.Buffer((512, 512, 3, 3)) + kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] + +0 + +8 + +0 + +3 + +0 + +0 + +8 + +0 + +0 + +ff_inner + +ff_outer_inner + +ff_outer_inner * 4 + +ff_inner + ff_outer_inner * 4 + +ff_inner + ff_outer_inner * 4 + blockIdx_x * 8 + +conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + +rc_inner + +rc_outer_inner + +rc_outer_outer * 8 + +rc_outer_inner + rc_outer_outer * 8 + +(rc_outer_inner + rc_outer_outer * 8) * 8 + +rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8 + +ry_outer_inner + +threadIdx_x // 7 + ry_outer_inner + +threadIdx_x % 7 + rx_outer_outer + +pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] + +kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] + +pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + 
ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] + +conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] + +conv2d_nchw = T.Buffer((1, 512, 7, 7)) +ff_inner = T.int32() +ff_outer_inner = T.int32() +blockIdx_x = T.int32() +threadIdx_x = T.int32() +pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") +rc_inner = T.int32() +rc_outer_inner = T.int32() +rc_outer_outer = T.int32() +ry_outer_inner = T.int32() +rx_outer_outer = T.int32() +kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") +conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] + +for ff_inner in range(4): + conv2d_nchw = T.Buffer((1, 512, 7, 7)) + ff_outer_inner = T.int32() + blockIdx_x = T.int32() + threadIdx_x = T.int32() + pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") + rc_inner = T.int32() + rc_outer_inner = T.int32() + rc_outer_outer = T.int32() + ry_outer_inner = T.int32() + rx_outer_outer = T.int32() + kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") + conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, 
threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] + +for rc_inner, ff_inner in T.grid(8, 4): + conv2d_nchw = T.Buffer((1, 512, 7, 7)) + ff_outer_inner = T.int32() + blockIdx_x = T.int32() + threadIdx_x = T.int32() + pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") + rc_outer_inner = T.int32() + rc_outer_outer = T.int32() + ry_outer_inner = T.int32() + rx_outer_outer = T.int32() + kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") + conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] + +for ff_outer_inner, rc_inner, ff_inner in T.grid(2, 8, 4): + conv2d_nchw = T.Buffer((1, 512, 7, 7)) + blockIdx_x = T.int32() + threadIdx_x = T.int32() + pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") + rc_outer_inner = T.int32() + rc_outer_outer = T.int32() + ry_outer_inner = T.int32() + rx_outer_outer = T.int32() + kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") + conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + 
blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] + +for ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(3, 2, 8, 4): + conv2d_nchw = T.Buffer((1, 512, 7, 7)) + blockIdx_x = T.int32() + threadIdx_x = T.int32() + pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") + rc_outer_inner = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") + conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] + +for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): + conv2d_nchw = T.Buffer((1, 512, 7, 7)) + blockIdx_x = T.int32() + threadIdx_x = T.int32() + pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") + conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] + +blockIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") +for 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): + if T.likely(rx_outer_outer - rx_outer_outer < 1): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): + kernel = T.Buffer((512, 512, 3, 3)) + kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] +for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): + conv2d_nchw = T.Buffer((1, 512, 7, 7)) + threadIdx_x = T.int32() + pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") + conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner 
+ (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] + +kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") +blockIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +with T.realize(kernel_shared[blockIdx_x * 8:blockIdx_x * 8 + 8, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:3, rx_outer_outer:rx_outer_outer + 1]): + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): + if T.likely(rx_outer_outer - rx_outer_outer < 1): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): + kernel = T.Buffer((512, 512, 3, 3)) + kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + 
rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] + for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): + conv2d_nchw = T.Buffer((1, 512, 7, 7)) + threadIdx_x = T.int32() + pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") + conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] + +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") +for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + data = T.Buffer((1, 512, 7, 7)) + pad_temp_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) +kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") +blockIdx_x = T.int32() +T.realize(kernel_shared[blockIdx_x * 8:blockIdx_x * 8 + 8, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:3, rx_outer_outer:rx_outer_outer + 1]) +for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): + if T.likely(rx_outer_outer - rx_outer_outer < 1): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): + kernel = T.Buffer((512, 512, 3, 3)) + kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] +for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): + conv2d_nchw = T.Buffer((1, 512, 7, 7)) + threadIdx_x = T.int32() + conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] + +pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +with T.realize(pad_temp_shared[0:1, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:9, rx_outer_outer:rx_outer_outer + 7]): + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1): + if T.likely((threadIdx_x + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + data = T.Buffer((1, 512, 7, 7)) + pad_temp_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) + kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") + blockIdx_x = T.int32() + T.realize(kernel_shared[blockIdx_x * 8:blockIdx_x * 8 + 8, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:3, 
rx_outer_outer:rx_outer_outer + 1]) + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): + if T.likely(rx_outer_outer - rx_outer_outer < 1): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): + kernel = T.Buffer((512, 512, 3, 3)) + kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] + for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): + conv2d_nchw = T.Buffer((1, 512, 7, 7)) + threadIdx_x = T.int32() + conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner 
+ rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] + +for rx_outer_outer in range(3): + pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") + rc_outer_outer = T.int32() + T.realize(pad_temp_shared[0:1, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:9, rx_outer_outer:rx_outer_outer + 7]) + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + data = T.Buffer((1, 512, 7, 7)) + pad_temp_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) + kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") + blockIdx_x = T.int32() + T.realize(kernel_shared[blockIdx_x * 8:blockIdx_x * 8 + 8, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:3, rx_outer_outer:rx_outer_outer + 1]) + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): + if T.likely(rx_outer_outer - rx_outer_outer < 1): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): + kernel = T.Buffer((512, 512, 3, 3)) + kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] + for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): + conv2d_nchw = T.Buffer((1, 512, 7, 7)) + threadIdx_x = T.int32() + conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] + +for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") + T.realize(pad_temp_shared[0:1, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:9, rx_outer_outer:rx_outer_outer + 7]) + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): + if T.likely(threadIdx_x + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + data = T.Buffer((1, 512, 7, 7)) + pad_temp_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) + kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") + blockIdx_x = T.int32() + T.realize(kernel_shared[blockIdx_x * 8:blockIdx_x * 8 + 8, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:3, rx_outer_outer:rx_outer_outer + 1]) + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): + if T.likely(rx_outer_outer - rx_outer_outer < 1): + if 
T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): + kernel = T.Buffer((512, 512, 3, 3)) + kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] + for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): + conv2d_nchw = T.Buffer((1, 512, 7, 7)) + threadIdx_x = T.int32() + conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] + +conv2d_nchw = T.Buffer((1, 512, 7, 7)) +blockIdx_x = T.int32() +for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): + threadIdx_x = T.int32() + conv2d_nchw[0, ff_inner_init + ff_outer_inner_init * 4 + blockIdx_x * 8, threadIdx_x // 
7, threadIdx_x % 7] = T.float32(0.0) +threadIdx_x = T.int32() +for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") + T.realize(pad_temp_shared[0:1, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:9, rx_outer_outer:rx_outer_outer + 7]) + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + data = T.Buffer((1, 512, 7, 7)) + pad_temp_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) + kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") + T.realize(kernel_shared[blockIdx_x * 8:blockIdx_x * 8 + 8, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:3, rx_outer_outer:rx_outer_outer + 1]) + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): + if T.likely(rx_outer_outer - rx_outer_outer < 1): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): + kernel = T.Buffer((512, 512, 3, 3)) + kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 
49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] + for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): + conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] + +nn_outer_outer_outer_outer = T.int32() +with T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_unroll_explicit", 1): + conv2d_nchw = T.Buffer((1, 512, 7, 7)) + blockIdx_x = T.int32() + for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): + threadIdx_x = T.int32() + conv2d_nchw[0, ff_inner_init + ff_outer_inner_init * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = T.float32(0.0) + threadIdx_x = T.int32() + for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") + T.realize(pad_temp_shared[0:1, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:9, rx_outer_outer:rx_outer_outer + 7]) + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): + if 
T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + data = T.Buffer((1, 512, 7, 7)) + pad_temp_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) + kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") + T.realize(kernel_shared[blockIdx_x * 8:blockIdx_x * 8 + 8, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:3, rx_outer_outer:rx_outer_outer + 1]) + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer 
* 64 - rc_outer_outer * 64 < 64): + if T.likely(rx_outer_outer - rx_outer_outer < 1): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): + kernel = T.Buffer((512, 512, 3, 3)) + kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] + for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): + conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] + +nn_outer_outer_outer_outer = T.int32() +with T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_auto_unroll_max_step", 1024): + T.attr(T.iter_var(nn_outer_outer_outer_outer, None, 
"DataPar", ""), "pragma_unroll_explicit", 1) + conv2d_nchw = T.Buffer((1, 512, 7, 7)) + blockIdx_x = T.int32() + for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): + threadIdx_x = T.int32() + conv2d_nchw[0, ff_inner_init + ff_outer_inner_init * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = T.float32(0.0) + threadIdx_x = T.int32() + for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") + T.realize(pad_temp_shared[0:1, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:9, rx_outer_outer:rx_outer_outer + 7]) + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + data = T.Buffer((1, 512, 7, 7)) + pad_temp_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) + kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") + T.realize(kernel_shared[blockIdx_x * 8:blockIdx_x * 8 + 8, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:3, rx_outer_outer:rx_outer_outer + 1]) + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): + if T.likely(rx_outer_outer - rx_outer_outer < 1): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): + kernel = T.Buffer((512, 512, 3, 3)) + kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + 
rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] + for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): + conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] + +0 + +8 + +7 + +threadIdx_x // 7 + +7 + +threadIdx_x // 7 // 7 + +vthread + +64 + +blockIdx_x // 64 + +vthread + blockIdx_x // 64 + +threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64) + +i1_inner + +blockIdx_x % 64 + +blockIdx_x % 64 * 8 + +i1_inner + blockIdx_x % 64 * 8 + +threadIdx_x // 7 % 7 + +threadIdx_x % 7 + +conv2d_nchw[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] + +0 + +0 + +bias[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, 0, 0] + +conv2d_nchw[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] + bias[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, 0, 0] + +T.float32(0.0) + +T.max(conv2d_nchw[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] + bias[threadIdx_x // 7 // 7 + 
(vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, 0, 0], T.float32(0.0)) + +compute = T.Buffer((1, 512, 7, 7)) +conv2d_nchw = T.Buffer((1, 512, 7, 7)) +threadIdx_x = T.int32() +vthread = T.int32() +blockIdx_x = T.int32() +i1_inner = T.int32() +bias = T.Buffer((1, 512, 1, 1)) +compute[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] = T.max(conv2d_nchw[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] + bias[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, 0, 0], T.float32(0.0)) + +for i1_inner in range(8): + compute = T.Buffer((1, 512, 7, 7)) + conv2d_nchw = T.Buffer((1, 512, 7, 7)) + threadIdx_x = T.int32() + vthread = T.int32() + blockIdx_x = T.int32() + bias = T.Buffer((1, 512, 1, 1)) + compute[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] = T.max(conv2d_nchw[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] + bias[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, 0, 0], T.float32(0.0)) + +nn_outer_outer_outer_outer = T.int32() +conv2d_nchw = T.Buffer((1, 512, 7, 7)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +with T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_auto_unroll_max_step", 1024): + T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_unroll_explicit", 1) + for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): + conv2d_nchw[0, ff_inner_init + ff_outer_inner_init * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = T.float32(0.0) + for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") + T.realize(pad_temp_shared[0:1, rc_outer_outer * 
64:rc_outer_outer * 64 + 64, 0:9, rx_outer_outer:rx_outer_outer + 7]) + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): + threadIdx_x_1 = T.launch_thread("threadIdx.x", 49) + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): + if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + data = T.Buffer((1, 512, 7, 7)) + pad_temp_shared[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) + kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") + T.realize(kernel_shared[blockIdx_x * 8:blockIdx_x * 8 + 8, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:3, rx_outer_outer:rx_outer_outer + 1]) + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): + threadIdx_x_1 = T.launch_thread("threadIdx.x", 49) + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): + if T.likely(rx_outer_outer - rx_outer_outer < 1): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): + if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): + kernel = T.Buffer((512, 512, 3, 3)) + kernel_shared[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] + for rc_outer_inner, 
ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): + conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] +for i1_inner in range(8): + compute = T.Buffer((1, 512, 7, 7)) + vthread = T.int32() + bias = T.Buffer((1, 512, 1, 1)) + compute[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] = T.max(conv2d_nchw[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] + bias[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, 0, 0], T.float32(0.0)) + +conv2d_nchw = T.Buffer((1, 512, 7, 7)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +with T.realize(conv2d_nchw[0:1, blockIdx_x * 8:blockIdx_x * 8 + 8, threadIdx_x // 7:threadIdx_x // 7 + 1, threadIdx_x % 7:threadIdx_x % 7 + 1]): + nn_outer_outer_outer_outer = T.int32() + with T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_auto_unroll_max_step", 1024): + T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_unroll_explicit", 1) + for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): + conv2d_nchw[0, ff_inner_init + ff_outer_inner_init * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = T.float32(0.0) + for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") + T.realize(pad_temp_shared[0:1, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:9, rx_outer_outer:rx_outer_outer + 7]) + 
for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): + threadIdx_x_1 = T.launch_thread("threadIdx.x", 49) + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): + if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + data = T.Buffer((1, 512, 7, 7)) + pad_temp_shared[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x_1 + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) + kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") + T.realize(kernel_shared[blockIdx_x * 8:blockIdx_x * 8 + 8, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:3, rx_outer_outer:rx_outer_outer + 1]) + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): + threadIdx_x_1 = T.launch_thread("threadIdx.x", 49) + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): + if T.likely(rx_outer_outer - rx_outer_outer < 1): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): + if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): + kernel = T.Buffer((512, 512, 3, 3)) + kernel_shared[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] + for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): + conv2d_nchw[0, 
ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] + for i1_inner in range(8): + compute = T.Buffer((1, 512, 7, 7)) + vthread = T.int32() + bias = T.Buffer((1, 512, 1, 1)) + compute[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] = T.max(conv2d_nchw[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] + bias[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, 0, 0], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + conv2d_nchw = T.Buffer((1, 512, 7, 7)) + blockIdx_x = T.int32() + T.realize(conv2d_nchw[0:1, blockIdx_x * 8:blockIdx_x * 8 + 8, threadIdx_x // 7:threadIdx_x // 7 + 1, threadIdx_x % 7:threadIdx_x % 7 + 1]) + nn_outer_outer_outer_outer = T.int32() + with T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_auto_unroll_max_step", 1024): + T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_unroll_explicit", 1) + for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): + conv2d_nchw[0, ff_inner_init + ff_outer_inner_init * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = T.float32(0.0) + for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") + T.realize(pad_temp_shared[0:1, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:9, rx_outer_outer:rx_outer_outer + 7]) + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): 
+ threadIdx_x_1 = T.launch_thread("threadIdx.x", 49) + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): + if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + data = T.Buffer((1, 512, 7, 7)) + pad_temp_shared[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], 
T.float32(0.0)) + kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") + T.realize(kernel_shared[blockIdx_x * 8:blockIdx_x * 8 + 8, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:3, rx_outer_outer:rx_outer_outer + 1]) + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): + threadIdx_x_1 = T.launch_thread("threadIdx.x", 49) + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): + if T.likely(rx_outer_outer - rx_outer_outer < 1): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): + if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): + kernel = T.Buffer((512, 512, 3, 3)) + kernel_shared[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] + for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): + conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 
7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] + for i1_inner in range(8): + compute = T.Buffer((1, 512, 7, 7)) + vthread = T.int32() + bias = T.Buffer((1, 512, 1, 1)) + compute[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] = T.max(conv2d_nchw[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] + bias[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, 0, 0], T.float32(0.0)) + +with T.launch_thread("vthread", 1) as vthread: + threadIdx_x = T.launch_thread("threadIdx.x", 49) + conv2d_nchw = T.Buffer((1, 512, 7, 7)) + blockIdx_x = T.int32() + T.realize(conv2d_nchw[0:1, blockIdx_x * 8:blockIdx_x * 8 + 8, threadIdx_x // 7:threadIdx_x // 7 + 1, threadIdx_x % 7:threadIdx_x % 7 + 1]) + nn_outer_outer_outer_outer = T.int32() + with T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_auto_unroll_max_step", 1024): + T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_unroll_explicit", 1) + for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): + conv2d_nchw[0, ff_inner_init + ff_outer_inner_init * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = T.float32(0.0) + for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") + T.realize(pad_temp_shared[0:1, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:9, rx_outer_outer:rx_outer_outer + 7]) + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): + threadIdx_x_1 = 
T.launch_thread("threadIdx.x", 49) + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): + if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + data = T.Buffer((1, 512, 7, 7)) + pad_temp_shared[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) + kernel_shared 
= T.Buffer((512, 512, 3, 3), scope="shared") + T.realize(kernel_shared[blockIdx_x * 8:blockIdx_x * 8 + 8, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:3, rx_outer_outer:rx_outer_outer + 1]) + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): + threadIdx_x_1 = T.launch_thread("threadIdx.x", 49) + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): + if T.likely(rx_outer_outer - rx_outer_outer < 1): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): + if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): + kernel = T.Buffer((512, 512, 3, 3)) + kernel_shared[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] + for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): + conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + 
ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] + for i1_inner in range(8): + compute = T.Buffer((1, 512, 7, 7)) + bias = T.Buffer((1, 512, 1, 1)) + compute[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] = T.max(conv2d_nchw[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] + bias[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, 0, 0], T.float32(0.0)) + +with T.launch_thread("blockIdx.x", 64) as blockIdx_x: + vthread = T.launch_thread("vthread", 1) + threadIdx_x = T.launch_thread("threadIdx.x", 49) + conv2d_nchw = T.Buffer((1, 512, 7, 7)) + T.realize(conv2d_nchw[0:1, blockIdx_x * 8:blockIdx_x * 8 + 8, threadIdx_x // 7:threadIdx_x // 7 + 1, threadIdx_x % 7:threadIdx_x % 7 + 1]) + nn_outer_outer_outer_outer = T.int32() + with T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_auto_unroll_max_step", 1024): + T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_unroll_explicit", 1) + for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): + conv2d_nchw[0, ff_inner_init + ff_outer_inner_init * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = T.float32(0.0) + for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") + T.realize(pad_temp_shared[0:1, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:9, rx_outer_outer:rx_outer_outer + 7]) + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): + threadIdx_x_1 = T.launch_thread("threadIdx.x", 49) + if 
T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): + if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + data = T.Buffer((1, 512, 7, 7)) + pad_temp_shared[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) + kernel_shared = T.Buffer((512, 512, 3, 3), 
scope="shared") + T.realize(kernel_shared[blockIdx_x * 8:blockIdx_x * 8 + 8, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:3, rx_outer_outer:rx_outer_outer + 1]) + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): + threadIdx_x_1 = T.launch_thread("threadIdx.x", 49) + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): + if T.likely(rx_outer_outer - rx_outer_outer < 1): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): + if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): + kernel = T.Buffer((512, 512, 3, 3)) + kernel_shared[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] + for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): + conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x 
* 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] + for i1_inner in range(8): + compute = T.Buffer((1, 512, 7, 7)) + bias = T.Buffer((1, 512, 1, 1)) + compute[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] = T.max(conv2d_nchw[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] + bias[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, 0, 0], T.float32(0.0)) + +compute = T.Buffer((1, 512, 7, 7)) +with T.realize(compute[0:1, 0:512, 0:7, 0:7]): + blockIdx_x = T.launch_thread("blockIdx.x", 64) + vthread = T.launch_thread("vthread", 1) + threadIdx_x = T.launch_thread("threadIdx.x", 49) + conv2d_nchw = T.Buffer((1, 512, 7, 7)) + T.realize(conv2d_nchw[0:1, blockIdx_x * 8:blockIdx_x * 8 + 8, threadIdx_x // 7:threadIdx_x // 7 + 1, threadIdx_x % 7:threadIdx_x % 7 + 1]) + nn_outer_outer_outer_outer = T.int32() + with T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_auto_unroll_max_step", 1024): + T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_unroll_explicit", 1) + for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): + conv2d_nchw[0, ff_inner_init + ff_outer_inner_init * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = T.float32(0.0) + for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") + T.realize(pad_temp_shared[0:1, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:9, rx_outer_outer:rx_outer_outer + 7]) + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): + threadIdx_x_1 = 
T.launch_thread("threadIdx.x", 49) + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): + if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): + data = T.Buffer((1, 512, 7, 7)) + pad_temp_shared[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) + kernel_shared 
= T.Buffer((512, 512, 3, 3), scope="shared") + T.realize(kernel_shared[blockIdx_x * 8:blockIdx_x * 8 + 8, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:3, rx_outer_outer:rx_outer_outer + 1]) + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): + threadIdx_x_1 = T.launch_thread("threadIdx.x", 49) + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): + if T.likely(rx_outer_outer - rx_outer_outer < 1): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): + if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): + if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): + kernel = T.Buffer((512, 512, 3, 3)) + kernel_shared[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] + for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): + conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + 
ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] + for i1_inner in range(8): + bias = T.Buffer((1, 512, 1, 1)) + compute[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] = T.max(conv2d_nchw[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] + bias[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, 0, 0], T.float32(0.0)) + +# from tvm.script import ir as I +# from tvm.script import tir as T + +@I.ir_module +class Module: + @T.prim_func + def main(data: T.Buffer((1, 512, 7, 7), "float32"), kernel: T.Buffer((512, 512, 3, 3), "float32"), bias: T.Buffer((1, 512, 1, 1), "float32"), compute: T.Buffer((1, 512, 7, 7), "float32")): + T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)}) + blockIdx_x = T.launch_thread("blockIdx.x", 64) + conv2d_nchw = T.allocate([8], "float32", "local") + pad_temp_shared = T.allocate([4032], "float32", "shared") + kernel_shared = T.allocate([1536], "float32", "shared") + threadIdx_x = T.launch_thread("threadIdx.x", 49) + conv2d_nchw_1 = T.Buffer((8,), data=conv2d_nchw, scope="local", align=32) + conv2d_nchw_1[0] = T.float32(0.0) + conv2d_nchw_1[1] = T.float32(0.0) + conv2d_nchw_1[2] = T.float32(0.0) + conv2d_nchw_1[3] = T.float32(0.0) + conv2d_nchw_1[4] = T.float32(0.0) + conv2d_nchw_1[5] = T.float32(0.0) + conv2d_nchw_1[6] = T.float32(0.0) + conv2d_nchw_1[7] = T.float32(0.0) + for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + cse_var_2: T.int32 = rc_outer_outer * 3136 + cse_var_1: T.int32 = rc_outer_outer * 576 + 
threadIdx_x_1 = T.env_thread("threadIdx.x") + pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") + data_1 = T.Buffer((25088,), data=data.data) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 49] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 49) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 98] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 98) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 147] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 147) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 196] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 196) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + 
pad_temp_shared_1[threadIdx_x_1 + 245] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 245) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 294] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 294) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 343] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 343) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 392] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 392) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 441] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 335], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 490] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and 
rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 490) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 539] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 539) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 588] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 588) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 637] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 637) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 686] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 686) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 735] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 735) // 
63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 784] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 784) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 833] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 833) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 882] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 678], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 931] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 931) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 980] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 980) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 
1029] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1029) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1078] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1078) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1127] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1127) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1176] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1176) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1225] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1225) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1274] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + 
threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1274) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1323] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 1021], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1372] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1372) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1421] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1421) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1470] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1470) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1519] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1519) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with 
T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1568] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1568) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1617] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1617) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1666] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1666) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1715] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1715) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1764] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 1364], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1813] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= 
rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1813) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1862] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1862) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1911] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1911) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1960] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1960) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2009] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2009) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2058] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 
< 8, data_1[cse_var_2 + (threadIdx_x_1 + 2058) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2107] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2107) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2156] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2156) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2205] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 1707], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2254] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2254) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2303] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2303) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with 
T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2352] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2352) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2401] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2401) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2450] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2450) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2499] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2499) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2548] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2548) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 
2597] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2597) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2646] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 2050], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2695] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2695) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2744] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2744) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2793] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2793) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2842] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2842) // 63 * 49 + 
threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2891] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2891) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2940] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2940) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2989] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2989) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3038] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3038) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3087] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 2393], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3136] = T.if_then_else(1 <= (threadIdx_x_1 
// 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3136) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3185] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3185) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3234] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3234) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3283] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3283) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3332] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3332) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3381] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= 
rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3381) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3430] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3430) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3479] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3479) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3528] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 2736], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3577] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3577) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3626] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3626) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + 
rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3675] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3675) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3724] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3724) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3773] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3773) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3822] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3822) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3871] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3871) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with 
T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3920] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3920) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3969] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 3079], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if threadIdx_x_1 < 14: + pad_temp_shared_1[threadIdx_x_1 + 4018] = T.if_then_else(threadIdx_x_1 < 7 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x_1 + 41], T.float32(0.0)) + threadIdx_x_2 = T.env_thread("threadIdx.x") + kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") + kernel_1 = T.Buffer((2359296,), data=kernel.data) + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2] = kernel_1[blockIdx_x * 36864 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 49] = kernel_1[blockIdx_x * 36864 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 147] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 98] = kernel_1[blockIdx_x * 36864 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 294] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 147] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 147) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 147) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 196] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 196) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + 
rx_outer_outer + 12] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 245] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 245) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 159] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 294] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 294) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 306] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 343] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 343) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 151) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 392] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 392) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 24] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 441] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 441) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 171] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 490] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 490) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 318] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 539] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 539) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 155) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 588] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 588) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 36] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 637] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 637) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 183] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 686] = kernel_1[blockIdx_x * 
36864 + (threadIdx_x_2 + 686) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 330] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 735] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 735) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 159) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 784] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 784) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 48] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 833] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 833) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 195] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 882] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 882) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 342] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 931] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 931) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 163) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 980] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 980) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 60] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1029] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1029) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 207] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1078] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1078) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 354] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1127] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1127) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 167) % 192 * 3 + rx_outer_outer] + with 
T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1176] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1176) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 72] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1225] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1225) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 219] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1274] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1274) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 366] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1323] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1323) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 171) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1372] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1372) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 84] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1421] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1421) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 231] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1470] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1470) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 378] + with T.launch_thread(threadIdx_x_2, 49): + if threadIdx_x_2 < 17: + kernel_shared_1[threadIdx_x_2 + 1519] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1519) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 525] + for rc_outer_inner in range(8): + cse_var_3: T.int32 = rc_outer_inner * 24 + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * 
kernel_shared_1[cse_var_3 + 192] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 384] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 576] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 3] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 195] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 387] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 579] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 6] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 198] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 390] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 582] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 9] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 201] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 393] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 585] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 
252] * kernel_shared_1[cse_var_3 + 12] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 204] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 396] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 588] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 15] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 207] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 399] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 591] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 18] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 210] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 402] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 594] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 21] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 213] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 405] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner 
* 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 597] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 768] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 960] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 1152] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 1344] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 771] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 963] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 1155] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 1347] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 774] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 966] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 1158] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 1350] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 777] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 969] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 1161] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 1353] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 780] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 972] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 1164] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 1356] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 783] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 975] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 1167] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 1359] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 786] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 978] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 1170] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 1362] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 789] + 
conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 981] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 1173] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 1365] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 1] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 193] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 385] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 577] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 4] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 196] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 388] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 580] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 7] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 199] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 391] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 
+ 583] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 10] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 202] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 394] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 586] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 13] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 205] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 397] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 589] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 16] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 208] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 400] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 592] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 19] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 211] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * 
kernel_shared_1[cse_var_3 + 403] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 595] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 22] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 214] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 406] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 598] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 769] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 961] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 1153] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 1345] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 772] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 964] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 1156] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 1348] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 775] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + 
threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 967] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 1159] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 1351] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 778] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 970] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 1162] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 1354] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 781] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 973] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 1165] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 1357] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 784] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 976] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 1168] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 1360] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 787] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 979] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 1171] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 1363] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 790] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 982] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 1174] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 1366] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 2] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 194] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 386] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 578] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 5] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 197] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 389] + conv2d_nchw_1[3] = 
conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 581] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 8] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 200] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 392] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 584] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 11] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 203] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 395] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 587] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 14] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 206] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 398] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 590] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 17] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 209] + 
conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 401] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 593] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 20] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 212] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 404] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 596] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 23] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 215] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 407] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 599] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 770] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 962] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 1154] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 1346] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * 
kernel_shared_1[cse_var_3 + 773] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 965] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 1157] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 1349] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 776] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 968] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 1160] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 1352] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 779] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 971] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 1163] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 1355] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 782] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 974] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 1166] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner 
* 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 1358] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 785] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 977] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 1169] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 1361] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 788] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 980] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 1172] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 1364] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 791] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 983] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 1175] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 1367] + for i1_inner in range(8): + compute_1 = T.Buffer((25088,), data=compute.data) + bias_1 = T.Buffer((512,), data=bias.data) + compute_1[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw_1[i1_inner] + bias_1[blockIdx_x * 8 + i1_inner], T.float32(0.0)) +Phase 1 +-------------------- +64 + +1 + 
+49 + +8 + +1024 + +1 + +0 + +2 + +0 + +4 + +T.float32(0.0) + +ff_outer_inner_init + +4 + +ff_outer_inner_init * 4 + +ff_inner_init + +ff_outer_inner_init * 4 + ff_inner_init + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +ff_outer_inner_init = T.int32() +ff_inner_init = T.int32() +conv2d_nchw[ff_outer_inner_init * 4 + ff_inner_init] = T.float32(0.0) + +for ff_inner_init in range(4): + conv2d_nchw = T.Buffer((8,), scope="local", align=32) + ff_outer_inner_init = T.int32() + conv2d_nchw[ff_outer_inner_init * 4 + ff_inner_init] = T.float32(0.0) + +for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): + conv2d_nchw = T.Buffer((8,), scope="local", align=32) + conv2d_nchw[ff_outer_inner_init * 4 + ff_inner_init] = T.float32(0.0) + +0 + +8 + +0 + +3 + +4032 + +0 + +83 + +49 + +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer + +7 + +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + +threadIdx_x + +7 + +threadIdx_x // 7 + +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 + +576 + +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 < 576 + +T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 < 576) + +49 + +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x + +4032 + +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 4032 + +T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 4032) + +1 + +7 + +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + +7 + +threadIdx_x // 7 + +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 + +9 + +(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 + +1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 + +7 + +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + +7 + +threadIdx_x // 7 + +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 + +9 + 
+(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 + +8 + +(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 + +1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 + +1 + +rx_outer_outer + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +rc_outer_outer + +3136 + +rc_outer_outer * 3136 + +7 + +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + +7 + +threadIdx_x // 7 + +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 + +9 + +(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 + +49 + +(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + +7 + +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + +7 + +threadIdx_x // 7 + +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 + +9 + +(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 + +7 + +(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + 
threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.float32(0.0) + +T.if_then_else(1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +49 + +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = T.if_then_else(1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x 
// 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() +threadIdx_x = T.int32() +if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 4032): + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = T.if_then_else(1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() +threadIdx_x = T.int32() +if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 < 576): + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 4032): + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = T.if_then_else(1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 
8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 < 576): + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 4032): + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = T.if_then_else(1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 < 576): + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 4032): + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = T.if_then_else(1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 and 
(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1536 + +0 + +32 + +49 + +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer + +49 + +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + +threadIdx_x + +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x + +1536 + +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 1536 + +T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 1536) + +blockIdx_x + +36864 + +blockIdx_x * 36864 + +49 + +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x + +192 + +(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 + +4608 + +(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + +576 + +rc_outer_outer * 576 + +blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + +49 + +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x + +192 + +(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 + +3 + +(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + 
threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 + rx_outer_outer] + +49 + +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 + rx_outer_outer] + +ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() +threadIdx_x = T.int32() +if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 1536): + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 1536): + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = 
T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 + rx_outer_outer] + +for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 1536): + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 + rx_outer_outer] + +0 + +8 + +0 + +3 + +0 + +0 + +8 + +0 + +ff_outer_inner + +4 + +ff_outer_inner * 4 + +ff_inner + +ff_outer_inner * 4 + ff_inner + +conv2d_nchw[ff_outer_inner * 4 + ff_inner] + +rc_outer_inner + +504 + +rc_outer_inner * 504 + +rc_inner + +63 + +rc_inner * 63 + +rc_outer_inner * 504 + rc_inner * 63 + +ry_outer_inner + +7 + +ry_outer_inner * 7 + +rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + +threadIdx_x + +rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] + +768 + +ff_outer_inner * 768 + +192 + +ff_inner * 192 + +ff_outer_inner * 768 + ff_inner * 192 + +24 + +rc_outer_inner * 24 + +ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + +3 + +rc_inner * 3 + +ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + +ff_outer_inner * 
768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner + +kernel_shared[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] + +pad_temp_shared[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] + +conv2d_nchw[ff_outer_inner * 4 + ff_inner] + pad_temp_shared[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] + +4 + +ff_outer_inner * 4 + +ff_outer_inner * 4 + ff_inner + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +ff_outer_inner = T.int32() +ff_inner = T.int32() +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +rc_inner = T.int32() +ry_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[ff_outer_inner * 4 + ff_inner] = conv2d_nchw[ff_outer_inner * 4 + ff_inner] + pad_temp_shared[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] + +for ff_inner in range(4): + conv2d_nchw = T.Buffer((8,), scope="local", align=32) + ff_outer_inner = T.int32() + pad_temp_shared = T.Buffer((4032,), scope="shared") + rc_outer_inner = T.int32() + rc_inner = T.int32() + ry_outer_inner = T.int32() + threadIdx_x = T.int32() + kernel_shared = T.Buffer((1536,), scope="shared") + conv2d_nchw[ff_outer_inner * 4 + ff_inner] = conv2d_nchw[ff_outer_inner * 4 + ff_inner] + pad_temp_shared[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] + +for rc_inner, ff_inner in T.grid(8, 4): + conv2d_nchw = T.Buffer((8,), 
scope="local", align=32) + ff_outer_inner = T.int32() + pad_temp_shared = T.Buffer((4032,), scope="shared") + rc_outer_inner = T.int32() + ry_outer_inner = T.int32() + threadIdx_x = T.int32() + kernel_shared = T.Buffer((1536,), scope="shared") + conv2d_nchw[ff_outer_inner * 4 + ff_inner] = conv2d_nchw[ff_outer_inner * 4 + ff_inner] + pad_temp_shared[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] + +for ff_outer_inner, rc_inner, ff_inner in T.grid(2, 8, 4): + conv2d_nchw = T.Buffer((8,), scope="local", align=32) + pad_temp_shared = T.Buffer((4032,), scope="shared") + rc_outer_inner = T.int32() + ry_outer_inner = T.int32() + threadIdx_x = T.int32() + kernel_shared = T.Buffer((1536,), scope="shared") + conv2d_nchw[ff_outer_inner * 4 + ff_inner] = conv2d_nchw[ff_outer_inner * 4 + ff_inner] + pad_temp_shared[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] + +for ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(3, 2, 8, 4): + conv2d_nchw = T.Buffer((8,), scope="local", align=32) + pad_temp_shared = T.Buffer((4032,), scope="shared") + rc_outer_inner = T.int32() + threadIdx_x = T.int32() + kernel_shared = T.Buffer((1536,), scope="shared") + conv2d_nchw[ff_outer_inner * 4 + ff_inner] = conv2d_nchw[ff_outer_inner * 4 + ff_inner] + pad_temp_shared[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] + +for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): + conv2d_nchw = T.Buffer((8,), scope="local", align=32) + pad_temp_shared = T.Buffer((4032,), scope="shared") + threadIdx_x = T.int32() + kernel_shared = T.Buffer((1536,), 
scope="shared") + conv2d_nchw[ff_outer_inner * 4 + ff_inner] = conv2d_nchw[ff_outer_inner * 4 + ff_inner] + pad_temp_shared[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] + +kernel_shared = T.Buffer((1536,), scope="shared") +for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 1536): + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 + rx_outer_outer] +for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): + conv2d_nchw = T.Buffer((8,), scope="local", align=32) + pad_temp_shared = T.Buffer((4032,), scope="shared") + threadIdx_x = T.int32() + conv2d_nchw[ff_outer_inner * 4 + ff_inner] = conv2d_nchw[ff_outer_inner * 4 + ff_inner] + pad_temp_shared[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] + +T.bool(True) + +with T.allocate([1536], "float32", "shared") as kernel_shared: + kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 1536): + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + 
kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 + rx_outer_outer] + for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): + conv2d_nchw = T.Buffer((8,), scope="local", align=32) + pad_temp_shared = T.Buffer((4032,), scope="shared") + threadIdx_x = T.int32() + conv2d_nchw[ff_outer_inner * 4 + ff_inner] = conv2d_nchw[ff_outer_inner * 4 + ff_inner] + pad_temp_shared[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared_1[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] + +pad_temp_shared = T.Buffer((4032,), scope="shared") +rx_outer_outer = T.int32() +rc_outer_outer = T.int32() +for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 < 576): + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 4032): + data = T.Buffer((25088,)) + pad_temp_shared[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = T.if_then_else(1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +kernel_shared = T.allocate([1536], "float32", "shared") +kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") 
+for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 1536): + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 + rx_outer_outer] +for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): + conv2d_nchw = T.Buffer((8,), scope="local", align=32) + threadIdx_x = T.int32() + conv2d_nchw[ff_outer_inner * 4 + ff_inner] = conv2d_nchw[ff_outer_inner * 4 + ff_inner] + pad_temp_shared[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared_1[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] + +T.bool(True) + +with T.allocate([4032], "float32", "shared") as pad_temp_shared: + pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") + rx_outer_outer = T.int32() + rc_outer_outer = T.int32() + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 < 576): + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 4032): + data = T.Buffer((25088,)) + pad_temp_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = T.if_then_else(1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + 
(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + kernel_shared = T.allocate([1536], "float32", "shared") + kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 1536): + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 + rx_outer_outer] + for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): + conv2d_nchw = T.Buffer((8,), scope="local", align=32) + threadIdx_x = T.int32() + conv2d_nchw[ff_outer_inner * 4 + ff_inner] = conv2d_nchw[ff_outer_inner * 4 + ff_inner] + pad_temp_shared_1[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared_1[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] + +for rx_outer_outer in range(3): + pad_temp_shared = T.allocate([4032], "float32", "shared") + pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") + rc_outer_outer = T.int32() + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 < 576): + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 4032): + data = T.Buffer((25088,)) + pad_temp_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 
+ threadIdx_x] = T.if_then_else(1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + kernel_shared = T.allocate([1536], "float32", "shared") + kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 1536): + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 + rx_outer_outer] + for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): + conv2d_nchw = T.Buffer((8,), scope="local", align=32) + threadIdx_x = T.int32() + conv2d_nchw[ff_outer_inner * 4 + ff_inner] = conv2d_nchw[ff_outer_inner * 4 + ff_inner] + pad_temp_shared_1[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared_1[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] + +for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + pad_temp_shared = T.allocate([4032], "float32", "shared") + pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): + threadIdx_x = 
T.launch_thread("threadIdx.x", 49) + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 < 576): + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 4032): + data = T.Buffer((25088,)) + pad_temp_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = T.if_then_else(1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + kernel_shared = T.allocate([1536], "float32", "shared") + kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 1536): + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 + rx_outer_outer] + for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): + conv2d_nchw = T.Buffer((8,), scope="local", align=32) + threadIdx_x = T.int32() + conv2d_nchw[ff_outer_inner * 4 + ff_inner] = conv2d_nchw[ff_outer_inner * 4 + ff_inner] + pad_temp_shared_1[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared_1[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + 
rc_inner * 3 + ry_outer_inner] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): + conv2d_nchw[ff_outer_inner_init * 4 + ff_inner_init] = T.float32(0.0) +for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + pad_temp_shared = T.allocate([4032], "float32", "shared") + pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 < 576): + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 4032): + data = T.Buffer((25088,)) + pad_temp_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = T.if_then_else(1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + kernel_shared = T.allocate([1536], "float32", "shared") + kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 1536): + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + 
threadIdx_x) % 192 * 3 + rx_outer_outer] + for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): + threadIdx_x = T.int32() + conv2d_nchw[ff_outer_inner * 4 + ff_inner] = conv2d_nchw[ff_outer_inner * 4 + ff_inner] + pad_temp_shared_1[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared_1[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] + +nn_outer_outer_outer_outer = T.int32() +with T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_unroll_explicit", 1): + conv2d_nchw = T.Buffer((8,), scope="local", align=32) + for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): + conv2d_nchw[ff_outer_inner_init * 4 + ff_inner_init] = T.float32(0.0) + for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + pad_temp_shared = T.allocate([4032], "float32", "shared") + pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 < 576): + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 4032): + data = T.Buffer((25088,)) + pad_temp_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = T.if_then_else(1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + kernel_shared = T.allocate([1536], "float32", "shared") + kernel_shared_1 = 
T.Buffer((1536,), data=kernel_shared, scope="shared") + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 1536): + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 + rx_outer_outer] + for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): + threadIdx_x = T.int32() + conv2d_nchw[ff_outer_inner * 4 + ff_inner] = conv2d_nchw[ff_outer_inner * 4 + ff_inner] + pad_temp_shared_1[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared_1[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] + +nn_outer_outer_outer_outer = T.int32() +with T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_auto_unroll_max_step", 1024): + T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_unroll_explicit", 1) + conv2d_nchw = T.Buffer((8,), scope="local", align=32) + for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): + conv2d_nchw[ff_outer_inner_init * 4 + ff_inner_init] = T.float32(0.0) + for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + pad_temp_shared = T.allocate([4032], "float32", "shared") + pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 < 576): + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 4032): + data = 
T.Buffer((25088,)) + pad_temp_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = T.if_then_else(1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + kernel_shared = T.allocate([1536], "float32", "shared") + kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 1536): + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 + rx_outer_outer] + for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): + threadIdx_x = T.int32() + conv2d_nchw[ff_outer_inner * 4 + ff_inner] = conv2d_nchw[ff_outer_inner * 4 + ff_inner] + pad_temp_shared_1[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared_1[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] + +0 + +8 + +i1_inner + +conv2d_nchw[i1_inner] + +8 + +blockIdx_x * 8 + +blockIdx_x * 8 + i1_inner + +bias[blockIdx_x * 8 + i1_inner] + +conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner] + +T.float32(0.0) + +T.max(conv2d_nchw[i1_inner] + 
bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) + +392 + +blockIdx_x * 392 + +49 + +i1_inner * 49 + +blockIdx_x * 392 + i1_inner * 49 + +blockIdx_x * 392 + i1_inner * 49 + threadIdx_x + +compute = T.Buffer((25088,)) +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +i1_inner = T.int32() +bias = T.Buffer((512,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) + +for i1_inner in range(8): + compute = T.Buffer((25088,)) + conv2d_nchw = T.Buffer((8,), scope="local", align=32) + bias = T.Buffer((512,)) + blockIdx_x = T.int32() + threadIdx_x = T.int32() + compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) + +nn_outer_outer_outer_outer = T.int32() +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +with T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_auto_unroll_max_step", 1024): + T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_unroll_explicit", 1) + for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): + conv2d_nchw[ff_outer_inner_init * 4 + ff_inner_init] = T.float32(0.0) + for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + pad_temp_shared = T.allocate([4032], "float32", "shared") + pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 < 576): + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 4032): + data = T.Buffer((25088,)) + pad_temp_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = T.if_then_else(1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 
+ threadIdx_x // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + kernel_shared = T.allocate([1536], "float32", "shared") + kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 1536): + kernel = T.Buffer((2359296,)) + kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 + rx_outer_outer] + for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): + conv2d_nchw[ff_outer_inner * 4 + ff_inner] = conv2d_nchw[ff_outer_inner * 4 + ff_inner] + pad_temp_shared_1[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared_1[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] +for i1_inner in range(8): + compute = T.Buffer((25088,)) + bias = T.Buffer((512,)) + compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) + +T.bool(True) + +with T.allocate([8], "float32", "local") as conv2d_nchw: + nn_outer_outer_outer_outer = T.int32() + conv2d_nchw_1 = T.Buffer((8,), data=conv2d_nchw, scope="local", align=32) + blockIdx_x = T.int32() + threadIdx_x = T.int32() + with 
T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_auto_unroll_max_step", 1024): + T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_unroll_explicit", 1) + for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): + conv2d_nchw_1[ff_outer_inner_init * 4 + ff_inner_init] = T.float32(0.0) + for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + pad_temp_shared = T.allocate([4032], "float32", "shared") + pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 < 576): + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 4032): + data = T.Buffer((25088,)) + pad_temp_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = T.if_then_else(1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + kernel_shared = T.allocate([1536], "float32", "shared") + kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): + threadIdx_x = T.launch_thread("threadIdx.x", 49) + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 1536): + kernel = T.Buffer((2359296,)) + kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + 
threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 + rx_outer_outer] + for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): + conv2d_nchw_1[ff_outer_inner * 4 + ff_inner] = conv2d_nchw_1[ff_outer_inner * 4 + ff_inner] + pad_temp_shared_1[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared_1[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] + for i1_inner in range(8): + compute = T.Buffer((25088,)) + bias = T.Buffer((512,)) + compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw_1[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + conv2d_nchw = T.allocate([8], "float32", "local") + nn_outer_outer_outer_outer = T.int32() + conv2d_nchw_1 = T.Buffer((8,), data=conv2d_nchw, scope="local", align=32) + blockIdx_x = T.int32() + with T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_auto_unroll_max_step", 1024): + T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_unroll_explicit", 1) + for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): + conv2d_nchw_1[ff_outer_inner_init * 4 + ff_inner_init] = T.float32(0.0) + for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + pad_temp_shared = T.allocate([4032], "float32", "shared") + pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): + threadIdx_x_1 = T.launch_thread("threadIdx.x", 49) + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_1 // 7 < 576): + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1 < 4032): + data = T.Buffer((25088,)) + pad_temp_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1] = T.if_then_else(1 <= 
(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_1 // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_1 // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + kernel_shared = T.allocate([1536], "float32", "shared") + kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): + threadIdx_x_1 = T.launch_thread("threadIdx.x", 49) + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1 < 1536): + kernel = T.Buffer((2359296,)) + kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): + conv2d_nchw_1[ff_outer_inner * 4 + ff_inner] = conv2d_nchw_1[ff_outer_inner * 4 + ff_inner] + pad_temp_shared_1[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared_1[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] + for i1_inner in range(8): + compute = T.Buffer((25088,)) + bias = T.Buffer((512,)) + compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw_1[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) + +with T.launch_thread("vthread", 1) as vthread: + threadIdx_x = T.launch_thread("threadIdx.x", 49) + conv2d_nchw = T.allocate([8], "float32", "local") + 
nn_outer_outer_outer_outer = T.int32() + conv2d_nchw_1 = T.Buffer((8,), data=conv2d_nchw, scope="local", align=32) + blockIdx_x = T.int32() + with T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_auto_unroll_max_step", 1024): + T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_unroll_explicit", 1) + for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): + conv2d_nchw_1[ff_outer_inner_init * 4 + ff_inner_init] = T.float32(0.0) + for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + pad_temp_shared = T.allocate([4032], "float32", "shared") + pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): + threadIdx_x_1 = T.launch_thread("threadIdx.x", 49) + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_1 // 7 < 576): + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1 < 4032): + data = T.Buffer((25088,)) + pad_temp_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1] = T.if_then_else(1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_1 // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_1 // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + kernel_shared = T.allocate([1536], "float32", "shared") + kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): + threadIdx_x_1 = T.launch_thread("threadIdx.x", 49) + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1 < 1536): + kernel = T.Buffer((2359296,)) + 
kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): + conv2d_nchw_1[ff_outer_inner * 4 + ff_inner] = conv2d_nchw_1[ff_outer_inner * 4 + ff_inner] + pad_temp_shared_1[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared_1[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] + for i1_inner in range(8): + compute = T.Buffer((25088,)) + bias = T.Buffer((512,)) + compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw_1[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) + +with T.launch_thread("blockIdx.x", 64) as blockIdx_x: + vthread = T.launch_thread("vthread", 1) + threadIdx_x = T.launch_thread("threadIdx.x", 49) + conv2d_nchw = T.allocate([8], "float32", "local") + nn_outer_outer_outer_outer = T.int32() + conv2d_nchw_1 = T.Buffer((8,), data=conv2d_nchw, scope="local", align=32) + with T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_auto_unroll_max_step", 1024): + T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_unroll_explicit", 1) + for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): + conv2d_nchw_1[ff_outer_inner_init * 4 + ff_inner_init] = T.float32(0.0) + for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + pad_temp_shared = T.allocate([4032], "float32", "shared") + pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): + threadIdx_x_1 = T.launch_thread("threadIdx.x", 49) + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_1 // 
7 < 576): + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1 < 4032): + data = T.Buffer((25088,)) + pad_temp_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1] = T.if_then_else(1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_1 // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_1 // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + kernel_shared = T.allocate([1536], "float32", "shared") + kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") + for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): + threadIdx_x_1 = T.launch_thread("threadIdx.x", 49) + if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1 < 1536): + kernel = T.Buffer((2359296,)) + kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): + conv2d_nchw_1[ff_outer_inner * 4 + ff_inner] = conv2d_nchw_1[ff_outer_inner * 4 + ff_inner] + pad_temp_shared_1[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared_1[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] + for i1_inner in range(8): + compute = T.Buffer((25088,)) + bias = T.Buffer((512,)) + compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = 
T.max(conv2d_nchw_1[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) + +# from tvm.script import ir as I +# from tvm.script import tir as T + +@I.ir_module +class Module: + @T.prim_func + def main(data: T.Buffer((1, 512, 7, 7), "float32"), kernel: T.Buffer((512, 512, 3, 3), "float32"), bias: T.Buffer((1, 512, 1, 1), "float32"), compute: T.Buffer((1, 512, 7, 7), "float32")): + T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)}) + blockIdx_x = T.launch_thread("blockIdx.x", 64) + conv2d_nchw = T.allocate([8], "float32", "local") + pad_temp_shared = T.allocate([4032], "float32", "shared") + kernel_shared = T.allocate([1536], "float32", "shared") + threadIdx_x = T.launch_thread("threadIdx.x", 49) + conv2d_nchw_1 = T.Buffer((8,), data=conv2d_nchw, scope="local", align=32) + conv2d_nchw_1[0] = T.float32(0.0) + conv2d_nchw_1[1] = T.float32(0.0) + conv2d_nchw_1[2] = T.float32(0.0) + conv2d_nchw_1[3] = T.float32(0.0) + conv2d_nchw_1[4] = T.float32(0.0) + conv2d_nchw_1[5] = T.float32(0.0) + conv2d_nchw_1[6] = T.float32(0.0) + conv2d_nchw_1[7] = T.float32(0.0) + for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + cse_var_2: T.int32 = rc_outer_outer * 3136 + cse_var_1: T.int32 = rc_outer_outer * 576 + threadIdx_x_1 = T.env_thread("threadIdx.x") + pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") + data_1 = T.Buffer((25088,), data=data.data) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 49] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 49) // 
63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 98] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 98) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 147] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 147) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 196] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 196) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 245] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 245) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 294] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 294) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], 
T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 343] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 343) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 392] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 392) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 441] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 335], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 490] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 490) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 539] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 539) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 588] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 
< 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 588) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 637] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 637) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 686] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 686) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 735] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 735) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 784] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 784) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 833] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 833) // 
63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 882] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 678], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 931] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 931) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 980] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 980) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1029] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1029) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1078] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1078) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1127] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and 
(threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1127) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1176] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1176) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1225] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1225) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1274] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1274) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1323] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 1021], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1372] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1372) // 63 * 49 
+ (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1421] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1421) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1470] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1470) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1519] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1519) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1568] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1568) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1617] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1617) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 
8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1666] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1666) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1715] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1715) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1764] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 1364], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1813] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1813) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1862] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1862) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1911] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 
// 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1911) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1960] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1960) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2009] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2009) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2058] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2058) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2107] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2107) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2156] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + 
(threadIdx_x_1 + 2156) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2205] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 1707], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2254] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2254) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2303] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2303) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2352] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2352) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2401] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2401) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2450] = T.if_then_else(1 <= 
(threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2450) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2499] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2499) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2548] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2548) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2597] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2597) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2646] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 2050], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2695] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + 
(threadIdx_x_1 + 2695) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2744] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2744) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2793] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2793) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2842] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2842) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2891] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2891) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2940] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2940) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + 
rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2989] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2989) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3038] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3038) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3087] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 2393], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3136] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3136) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3185] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3185) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3234] = T.if_then_else(1 <= 
(threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3234) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3283] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3283) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3332] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3332) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3381] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3381) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3430] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3430) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3479] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and 
rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3479) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3528] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 2736], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3577] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3577) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3626] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3626) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3675] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3675) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3724] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3724) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + 
pad_temp_shared_1[threadIdx_x_1 + 3773] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3773) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3822] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3822) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3871] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3871) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3920] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3920) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3969] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 3079], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if threadIdx_x_1 < 14: + pad_temp_shared_1[threadIdx_x_1 + 4018] = T.if_then_else(threadIdx_x_1 < 7 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + 
threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x_1 + 41], T.float32(0.0)) + threadIdx_x_2 = T.env_thread("threadIdx.x") + kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") + kernel_1 = T.Buffer((2359296,), data=kernel.data) + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2] = kernel_1[blockIdx_x * 36864 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 49] = kernel_1[blockIdx_x * 36864 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 147] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 98] = kernel_1[blockIdx_x * 36864 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 294] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 147] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 147) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 147) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 196] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 196) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 12] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 245] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 245) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 159] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 294] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 294) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 306] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 343] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 343) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 151) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 392] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 392) // 192 * 4608 + 
cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 24] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 441] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 441) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 171] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 490] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 490) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 318] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 539] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 539) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 155) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 588] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 588) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 36] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 637] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 637) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 183] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 686] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 686) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 330] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 735] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 735) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 159) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 784] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 784) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 48] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 833] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 833) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 195] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 
882] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 882) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 342] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 931] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 931) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 163) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 980] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 980) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 60] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1029] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1029) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 207] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1078] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1078) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 354] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1127] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1127) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 167) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1176] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1176) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 72] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1225] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1225) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 219] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1274] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1274) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 366] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1323] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1323) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 171) % 192 * 
3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1372] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1372) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 84] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1421] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1421) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 231] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1470] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1470) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 378] + with T.launch_thread(threadIdx_x_2, 49): + if threadIdx_x_2 < 17: + kernel_shared_1[threadIdx_x_2 + 1519] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1519) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 525] + for rc_outer_inner in range(8): + cse_var_3: T.int32 = rc_outer_inner * 24 + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 192] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 384] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 576] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 3] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 195] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 387] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 579] + 
conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 6] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 198] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 390] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 582] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 9] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 201] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 393] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 585] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 12] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 204] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 396] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 588] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 15] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 207] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * 
kernel_shared_1[cse_var_3 + 399] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 591] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 18] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 210] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 402] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 594] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 21] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 213] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 405] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 597] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 768] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 960] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 1152] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 1344] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 771] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * 
kernel_shared_1[cse_var_3 + 963] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 1155] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 1347] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 774] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 966] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 1158] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 1350] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 777] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 969] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 1161] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 1353] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 780] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 972] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 1164] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 1356] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 783] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 975] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 1167] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 1359] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 786] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 978] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 1170] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 1362] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 789] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 981] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 1173] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 1365] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 1] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 193] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 385] + 
conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 577] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 4] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 196] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 388] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 580] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 7] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 199] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 391] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 583] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 10] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 202] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 394] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 586] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 13] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * 
kernel_shared_1[cse_var_3 + 205] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 397] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 589] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 16] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 208] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 400] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 592] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 19] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 211] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 403] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 595] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 22] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 214] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 406] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 598] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 
+ threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 769] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 961] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 1153] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 1345] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 772] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 964] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 1156] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 1348] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 775] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 967] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 1159] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 1351] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 778] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 970] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 1162] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 1354] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 781] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 973] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 1165] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 1357] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 784] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 976] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 1168] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 1360] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 787] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 979] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 1171] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 1363] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 790] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 982] + 
conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 1174] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 1366] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 2] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 194] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 386] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 578] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 5] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 197] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 389] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 581] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 8] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 200] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 392] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 584] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * 
kernel_shared_1[cse_var_3 + 11] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 203] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 395] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 587] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 14] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 206] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 398] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 590] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 17] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 209] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 401] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 593] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 20] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 212] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 404] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + 
threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 596] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 23] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 215] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 407] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 599] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 770] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 962] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 1154] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 1346] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 773] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 965] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 1157] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 1349] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 776] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 968] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 1160] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 1352] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 779] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 971] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 1163] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 1355] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 782] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 974] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 1166] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 1358] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 785] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 977] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 1169] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 1361] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 788] + 
conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 980] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 1172] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 1364] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 791] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 983] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 1175] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 1367] + for i1_inner in range(8): + compute_1 = T.Buffer((25088,), data=compute.data) + bias_1 = T.Buffer((512,), data=bias.data) + compute_1[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw_1[i1_inner] + bias_1[blockIdx_x * 8 + i1_inner], T.float32(0.0)) +Phase 2 +-------------------- +64 + +8 + +4032 + +1536 + +49 + +T.float32(0.0) + +0 + +4 + +T.Mul(0, 4) + +0 + +T.Add(T.Mul(0, 4), 0) + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = T.float32(0.0) + +T.Mul(0, 4) + +1 + +T.Mul(0, 4) + 1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +conv2d_nchw[T.Mul(0, 4) + 1] = T.float32(0.0) + +T.Mul(0, 4) + +2 + +T.Mul(0, 4) + 2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +conv2d_nchw[T.Mul(0, 4) + 2] = T.float32(0.0) + +T.Mul(0, 4) + +3 + +T.Mul(0, 4) + 3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +conv2d_nchw[T.Mul(0, 4) + 3] = T.float32(0.0) + +1 + +T.Mul(1, 4) + +T.Add(T.Mul(1, 4), 0) + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) 
+conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = T.float32(0.0) + +T.Mul(1, 4) + +T.Mul(1, 4) + 1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +conv2d_nchw[T.Mul(1, 4) + 1] = T.float32(0.0) + +T.Mul(1, 4) + +T.Mul(1, 4) + 2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +conv2d_nchw[T.Mul(1, 4) + 2] = T.float32(0.0) + +T.Mul(1, 4) + +T.Mul(1, 4) + 3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +conv2d_nchw[T.Mul(1, 4) + 3] = T.float32(0.0) + +0 + +8 + +0 + +3 + +49 + +0 + +7 + +T.Mul(0, 7) + +threadIdx_x + +7 + +threadIdx_x // 7 + +T.Mul(0, 7) + threadIdx_x // 7 + +576 + +T.Mul(0, 7) + threadIdx_x // 7 < 576 + +49 + +T.Mul(0, 49) + +T.Mul(0, 49) + threadIdx_x + +4032 + +T.Mul(0, 49) + threadIdx_x < 4032 + +1 + +7 + +T.Mul(0, 7) + +7 + +threadIdx_x // 7 + +T.Mul(0, 7) + threadIdx_x // 7 + +9 + +(T.Mul(0, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(0, 7) + threadIdx_x // 7) % 9 + +7 + +T.Mul(0, 7) + +7 + +threadIdx_x // 7 + +T.Mul(0, 7) + threadIdx_x // 7 + +9 + +(T.Mul(0, 7) + threadIdx_x // 7) % 9 + +8 + +(T.Mul(0, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(0, 7) + threadIdx_x // 7) % 9 and (T.Mul(0, 7) + threadIdx_x // 7) % 9 < 8 + +1 + +rx_outer_outer + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(0, 7) + threadIdx_x // 7) % 9 and (T.Mul(0, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (T.Mul(0, 7) + threadIdx_x // 7) % 9 and (T.Mul(0, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +rc_outer_outer + +3136 + +rc_outer_outer * 3136 + +7 + +T.Mul(0, 7) + +7 + +threadIdx_x // 7 + +T.Mul(0, 7) + threadIdx_x // 7 + +9 + +(T.Mul(0, 7) + threadIdx_x // 7) // 9 + +49 + +(T.Mul(0, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x // 7) // 9 
* 49 + +7 + +T.Mul(0, 7) + +7 + +threadIdx_x // 7 + +T.Mul(0, 7) + threadIdx_x // 7 + +9 + +(T.Mul(0, 7) + threadIdx_x // 7) % 9 + +7 + +(T.Mul(0, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.float32(0.0) + +T.if_then_else(1 <= (T.Mul(0, 7) + threadIdx_x // 7) % 9 and (T.Mul(0, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +49 + +T.Mul(0, 49) + +T.Mul(0, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(0, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(0, 7) + threadIdx_x // 7) % 9 and (T.Mul(0, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(0, 49) + threadIdx_x < 4032: + pad_temp_shared = 
T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(0, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(0, 7) + threadIdx_x // 7) % 9 and (T.Mul(0, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(0, 7) + threadIdx_x // 7 < 576: + if T.Mul(0, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(0, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(0, 7) + threadIdx_x // 7) % 9 and (T.Mul(0, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(0, 7) + threadIdx_x // 7 < 576: + if T.Mul(0, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(0, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(0, 7) + threadIdx_x // 7) % 9 and (T.Mul(0, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +T.Mul(1, 7) + +T.Mul(1, 7) + threadIdx_x // 7 + +T.Mul(1, 7) + threadIdx_x // 7 < 576 + +T.Mul(1, 49) + +T.Mul(1, 49) + 
threadIdx_x + +T.Mul(1, 49) + threadIdx_x < 4032 + +T.Mul(1, 7) + +T.Mul(1, 7) + threadIdx_x // 7 + +(T.Mul(1, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(1, 7) + threadIdx_x // 7) % 9 + +T.Mul(1, 7) + +T.Mul(1, 7) + threadIdx_x // 7 + +(T.Mul(1, 7) + threadIdx_x // 7) % 9 + +(T.Mul(1, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(1, 7) + threadIdx_x // 7) % 9 and (T.Mul(1, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(1, 7) + threadIdx_x // 7) % 9 and (T.Mul(1, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(1, 7) + threadIdx_x // 7) % 9 and (T.Mul(1, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(1, 7) + +T.Mul(1, 7) + threadIdx_x // 7 + +(T.Mul(1, 7) + threadIdx_x // 7) // 9 + +(T.Mul(1, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(1, 7) + +T.Mul(1, 7) + threadIdx_x // 7 + +(T.Mul(1, 7) + threadIdx_x // 7) % 9 + +(T.Mul(1, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(1, 7) + threadIdx_x // 7) % 9 and (T.Mul(1, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x // 
7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(1, 49) + +T.Mul(1, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(1, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(1, 7) + threadIdx_x // 7) % 9 and (T.Mul(1, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(1, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(1, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(1, 7) + threadIdx_x // 7) % 9 and (T.Mul(1, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(1, 7) + threadIdx_x // 7 < 576: + if T.Mul(1, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(1, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(1, 7) + threadIdx_x // 7) % 9 and (T.Mul(1, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + 
+with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(1, 7) + threadIdx_x // 7 < 576: + if T.Mul(1, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(1, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(1, 7) + threadIdx_x // 7) % 9 and (T.Mul(1, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +2 + +T.Mul(2, 7) + +T.Mul(2, 7) + threadIdx_x // 7 + +T.Mul(2, 7) + threadIdx_x // 7 < 576 + +T.Mul(2, 49) + +T.Mul(2, 49) + threadIdx_x + +T.Mul(2, 49) + threadIdx_x < 4032 + +T.Mul(2, 7) + +T.Mul(2, 7) + threadIdx_x // 7 + +(T.Mul(2, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(2, 7) + threadIdx_x // 7) % 9 + +T.Mul(2, 7) + +T.Mul(2, 7) + threadIdx_x // 7 + +(T.Mul(2, 7) + threadIdx_x // 7) % 9 + +(T.Mul(2, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(2, 7) + threadIdx_x // 7) % 9 and (T.Mul(2, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(2, 7) + threadIdx_x // 7) % 9 and (T.Mul(2, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(2, 7) + threadIdx_x // 7) % 9 and (T.Mul(2, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(2, 7) + +T.Mul(2, 7) + threadIdx_x // 7 + +(T.Mul(2, 7) + threadIdx_x // 7) // 9 + +(T.Mul(2, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(2, 7) + +T.Mul(2, 7) + threadIdx_x // 7 + +(T.Mul(2, 7) + threadIdx_x // 7) % 9 + +(T.Mul(2, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 
3136 + (T.Mul(2, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(2, 7) + threadIdx_x // 7) % 9 and (T.Mul(2, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(2, 49) + +T.Mul(2, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(2, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(2, 7) + threadIdx_x // 7) % 9 and (T.Mul(2, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(2, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(2, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(2, 7) + threadIdx_x // 7) % 9 and (T.Mul(2, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, 
data[rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(2, 7) + threadIdx_x // 7 < 576: + if T.Mul(2, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(2, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(2, 7) + threadIdx_x // 7) % 9 and (T.Mul(2, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(2, 7) + threadIdx_x // 7 < 576: + if T.Mul(2, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(2, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(2, 7) + threadIdx_x // 7) % 9 and (T.Mul(2, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +3 + +T.Mul(3, 7) + +T.Mul(3, 7) + threadIdx_x // 7 + +T.Mul(3, 7) + threadIdx_x // 7 < 576 + +T.Mul(3, 49) + +T.Mul(3, 49) + threadIdx_x + +T.Mul(3, 49) + threadIdx_x < 4032 + +T.Mul(3, 7) + +T.Mul(3, 7) + threadIdx_x // 7 + +(T.Mul(3, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(3, 7) + threadIdx_x // 7) % 9 + +T.Mul(3, 7) + +T.Mul(3, 7) + threadIdx_x // 7 + +(T.Mul(3, 7) + threadIdx_x // 7) % 9 + +(T.Mul(3, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(3, 7) + threadIdx_x // 7) 
% 9 and (T.Mul(3, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(3, 7) + threadIdx_x // 7) % 9 and (T.Mul(3, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(3, 7) + threadIdx_x // 7) % 9 and (T.Mul(3, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(3, 7) + +T.Mul(3, 7) + threadIdx_x // 7 + +(T.Mul(3, 7) + threadIdx_x // 7) // 9 + +(T.Mul(3, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(3, 7) + +T.Mul(3, 7) + threadIdx_x // 7 + +(T.Mul(3, 7) + threadIdx_x // 7) % 9 + +(T.Mul(3, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(3, 7) + threadIdx_x // 7) % 9 and (T.Mul(3, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(3, 49) + +T.Mul(3, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(3, 49) + 
threadIdx_x] = T.if_then_else(1 <= (T.Mul(3, 7) + threadIdx_x // 7) % 9 and (T.Mul(3, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(3, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(3, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(3, 7) + threadIdx_x // 7) % 9 and (T.Mul(3, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(3, 7) + threadIdx_x // 7 < 576: + if T.Mul(3, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(3, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(3, 7) + threadIdx_x // 7) % 9 and (T.Mul(3, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(3, 7) + threadIdx_x // 7 < 576: + if T.Mul(3, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(3, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(3, 
7) + threadIdx_x // 7) % 9 and (T.Mul(3, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +4 + +T.Mul(4, 7) + +T.Mul(4, 7) + threadIdx_x // 7 + +T.Mul(4, 7) + threadIdx_x // 7 < 576 + +T.Mul(4, 49) + +T.Mul(4, 49) + threadIdx_x + +T.Mul(4, 49) + threadIdx_x < 4032 + +T.Mul(4, 7) + +T.Mul(4, 7) + threadIdx_x // 7 + +(T.Mul(4, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(4, 7) + threadIdx_x // 7) % 9 + +T.Mul(4, 7) + +T.Mul(4, 7) + threadIdx_x // 7 + +(T.Mul(4, 7) + threadIdx_x // 7) % 9 + +(T.Mul(4, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(4, 7) + threadIdx_x // 7) % 9 and (T.Mul(4, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(4, 7) + threadIdx_x // 7) % 9 and (T.Mul(4, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(4, 7) + threadIdx_x // 7) % 9 and (T.Mul(4, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(4, 7) + +T.Mul(4, 7) + threadIdx_x // 7 + +(T.Mul(4, 7) + threadIdx_x // 7) // 9 + +(T.Mul(4, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(4, 7) + +T.Mul(4, 7) + threadIdx_x // 7 + +(T.Mul(4, 7) + threadIdx_x // 7) % 9 + +(T.Mul(4, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x // 
7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(4, 7) + threadIdx_x // 7) % 9 and (T.Mul(4, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(4, 49) + +T.Mul(4, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(4, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(4, 7) + threadIdx_x // 7) % 9 and (T.Mul(4, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(4, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(4, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(4, 7) + threadIdx_x // 7) % 9 and (T.Mul(4, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(4, 7) + threadIdx_x // 7 < 576: + if T.Mul(4, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = 
T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(4, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(4, 7) + threadIdx_x // 7) % 9 and (T.Mul(4, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(4, 7) + threadIdx_x // 7 < 576: + if T.Mul(4, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(4, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(4, 7) + threadIdx_x // 7) % 9 and (T.Mul(4, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +5 + +T.Mul(5, 7) + +T.Mul(5, 7) + threadIdx_x // 7 + +T.Mul(5, 7) + threadIdx_x // 7 < 576 + +T.Mul(5, 49) + +T.Mul(5, 49) + threadIdx_x + +T.Mul(5, 49) + threadIdx_x < 4032 + +T.Mul(5, 7) + +T.Mul(5, 7) + threadIdx_x // 7 + +(T.Mul(5, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(5, 7) + threadIdx_x // 7) % 9 + +T.Mul(5, 7) + +T.Mul(5, 7) + threadIdx_x // 7 + +(T.Mul(5, 7) + threadIdx_x // 7) % 9 + +(T.Mul(5, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(5, 7) + threadIdx_x // 7) % 9 and (T.Mul(5, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(5, 7) + threadIdx_x // 7) % 9 and (T.Mul(5, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(5, 7) + threadIdx_x // 7) % 9 and (T.Mul(5, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(5, 7) + 
+T.Mul(5, 7) + threadIdx_x // 7 + +(T.Mul(5, 7) + threadIdx_x // 7) // 9 + +(T.Mul(5, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(5, 7) + +T.Mul(5, 7) + threadIdx_x // 7 + +(T.Mul(5, 7) + threadIdx_x // 7) % 9 + +(T.Mul(5, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(5, 7) + threadIdx_x // 7) % 9 and (T.Mul(5, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(5, 49) + +T.Mul(5, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(5, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(5, 7) + threadIdx_x // 7) % 9 and (T.Mul(5, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + 
+threadIdx_x = T.int32() +if T.Mul(5, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(5, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(5, 7) + threadIdx_x // 7) % 9 and (T.Mul(5, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(5, 7) + threadIdx_x // 7 < 576: + if T.Mul(5, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(5, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(5, 7) + threadIdx_x // 7) % 9 and (T.Mul(5, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(5, 7) + threadIdx_x // 7 < 576: + if T.Mul(5, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(5, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(5, 7) + threadIdx_x // 7) % 9 and (T.Mul(5, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +6 + +T.Mul(6, 7) + +T.Mul(6, 7) + 
threadIdx_x // 7 + +T.Mul(6, 7) + threadIdx_x // 7 < 576 + +T.Mul(6, 49) + +T.Mul(6, 49) + threadIdx_x + +T.Mul(6, 49) + threadIdx_x < 4032 + +T.Mul(6, 7) + +T.Mul(6, 7) + threadIdx_x // 7 + +(T.Mul(6, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(6, 7) + threadIdx_x // 7) % 9 + +T.Mul(6, 7) + +T.Mul(6, 7) + threadIdx_x // 7 + +(T.Mul(6, 7) + threadIdx_x // 7) % 9 + +(T.Mul(6, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(6, 7) + threadIdx_x // 7) % 9 and (T.Mul(6, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(6, 7) + threadIdx_x // 7) % 9 and (T.Mul(6, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(6, 7) + threadIdx_x // 7) % 9 and (T.Mul(6, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(6, 7) + +T.Mul(6, 7) + threadIdx_x // 7 + +(T.Mul(6, 7) + threadIdx_x // 7) // 9 + +(T.Mul(6, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(6, 7) + +T.Mul(6, 7) + threadIdx_x // 7 + +(T.Mul(6, 7) + threadIdx_x // 7) % 9 + +(T.Mul(6, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(6, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(6, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(6, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(6, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(6, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(6, 7) + threadIdx_x // 7) % 9 and (T.Mul(6, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and 
rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(6, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(6, 49) + +T.Mul(6, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(6, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(6, 7) + threadIdx_x // 7) % 9 and (T.Mul(6, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(6, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(6, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(6, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(6, 7) + threadIdx_x // 7) % 9 and (T.Mul(6, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(6, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(6, 7) + threadIdx_x // 7 < 576: + if T.Mul(6, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(6, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(6, 7) + threadIdx_x // 7) % 9 and (T.Mul(6, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(6, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(6, 7) + threadIdx_x // 7 < 576: + if T.Mul(6, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(6, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(6, 7) + threadIdx_x // 7) % 9 and (T.Mul(6, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(6, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +7 + +T.Mul(7, 7) + +T.Mul(7, 7) + threadIdx_x // 7 + +T.Mul(7, 7) + threadIdx_x // 7 < 576 + +T.Mul(7, 49) + +T.Mul(7, 49) + threadIdx_x + +T.Mul(7, 49) + threadIdx_x < 4032 + +T.Mul(7, 7) + +T.Mul(7, 7) + threadIdx_x // 7 + +(T.Mul(7, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(7, 7) + threadIdx_x // 7) % 9 + +T.Mul(7, 7) + +T.Mul(7, 7) + threadIdx_x // 7 + +(T.Mul(7, 7) + threadIdx_x // 7) % 9 + +(T.Mul(7, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(7, 7) + threadIdx_x // 7) % 9 and (T.Mul(7, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(7, 7) + threadIdx_x // 7) % 9 and (T.Mul(7, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(7, 7) + threadIdx_x // 7) % 9 and (T.Mul(7, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(7, 7) + +T.Mul(7, 7) + threadIdx_x // 7 + +(T.Mul(7, 7) + threadIdx_x // 7) // 9 + +(T.Mul(7, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(7, 7) + +T.Mul(7, 7) + threadIdx_x // 7 + +(T.Mul(7, 7) + threadIdx_x // 7) % 9 + +(T.Mul(7, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + 
(T.Mul(7, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(7, 7) + threadIdx_x // 7) % 9 and (T.Mul(7, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(7, 49) + +T.Mul(7, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(7, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(7, 7) + threadIdx_x // 7) % 9 and (T.Mul(7, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(7, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(7, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(7, 7) + threadIdx_x // 7) % 9 and (T.Mul(7, 7) + threadIdx_x // 7) % 
9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(7, 7) + threadIdx_x // 7 < 576: + if T.Mul(7, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(7, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(7, 7) + threadIdx_x // 7) % 9 and (T.Mul(7, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(7, 7) + threadIdx_x // 7 < 576: + if T.Mul(7, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(7, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(7, 7) + threadIdx_x // 7) % 9 and (T.Mul(7, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +8 + +T.Mul(8, 7) + +T.Mul(8, 7) + threadIdx_x // 7 + +T.Mul(8, 7) + threadIdx_x // 7 < 576 + +T.Mul(8, 49) + +T.Mul(8, 49) + threadIdx_x + +T.Mul(8, 49) + threadIdx_x < 4032 + +T.Mul(8, 7) + +T.Mul(8, 7) + threadIdx_x // 7 + +(T.Mul(8, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(8, 7) + threadIdx_x // 7) % 9 + +T.Mul(8, 7) + +T.Mul(8, 7) + threadIdx_x // 7 + +(T.Mul(8, 7) + threadIdx_x // 
7) % 9 + +(T.Mul(8, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(8, 7) + threadIdx_x // 7) % 9 and (T.Mul(8, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(8, 7) + threadIdx_x // 7) % 9 and (T.Mul(8, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(8, 7) + threadIdx_x // 7) % 9 and (T.Mul(8, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(8, 7) + +T.Mul(8, 7) + threadIdx_x // 7 + +(T.Mul(8, 7) + threadIdx_x // 7) // 9 + +(T.Mul(8, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(8, 7) + +T.Mul(8, 7) + threadIdx_x // 7 + +(T.Mul(8, 7) + threadIdx_x // 7) % 9 + +(T.Mul(8, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(8, 7) + threadIdx_x // 7) % 9 and (T.Mul(8, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(8, 49) + +T.Mul(8, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = 
T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(8, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(8, 7) + threadIdx_x // 7) % 9 and (T.Mul(8, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(8, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(8, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(8, 7) + threadIdx_x // 7) % 9 and (T.Mul(8, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(8, 7) + threadIdx_x // 7 < 576: + if T.Mul(8, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(8, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(8, 7) + threadIdx_x // 7) % 9 and (T.Mul(8, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(8, 7) + threadIdx_x // 7 < 576: + if T.Mul(8, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + 
rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(8, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(8, 7) + threadIdx_x // 7) % 9 and (T.Mul(8, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +9 + +T.Mul(9, 7) + +T.Mul(9, 7) + threadIdx_x // 7 + +T.Mul(9, 7) + threadIdx_x // 7 < 576 + +T.Mul(9, 49) + +T.Mul(9, 49) + threadIdx_x + +T.Mul(9, 49) + threadIdx_x < 4032 + +T.Mul(9, 7) + +T.Mul(9, 7) + threadIdx_x // 7 + +(T.Mul(9, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(9, 7) + threadIdx_x // 7) % 9 + +T.Mul(9, 7) + +T.Mul(9, 7) + threadIdx_x // 7 + +(T.Mul(9, 7) + threadIdx_x // 7) % 9 + +(T.Mul(9, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(9, 7) + threadIdx_x // 7) % 9 and (T.Mul(9, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(9, 7) + threadIdx_x // 7) % 9 and (T.Mul(9, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(9, 7) + threadIdx_x // 7) % 9 and (T.Mul(9, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(9, 7) + +T.Mul(9, 7) + threadIdx_x // 7 + +(T.Mul(9, 7) + threadIdx_x // 7) // 9 + +(T.Mul(9, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(9, 7) + +T.Mul(9, 7) + threadIdx_x // 7 + +(T.Mul(9, 7) + threadIdx_x // 7) % 9 + +(T.Mul(9, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 
7 + +rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(9, 7) + threadIdx_x // 7) % 9 and (T.Mul(9, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(9, 49) + +T.Mul(9, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(9, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(9, 7) + threadIdx_x // 7) % 9 and (T.Mul(9, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(9, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(9, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(9, 7) + threadIdx_x // 7) % 9 and (T.Mul(9, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(9, 7) + threadIdx_x // 7 < 576: + if T.Mul(9, 49) + threadIdx_x 
< 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(9, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(9, 7) + threadIdx_x // 7) % 9 and (T.Mul(9, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(9, 7) + threadIdx_x // 7 < 576: + if T.Mul(9, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(9, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(9, 7) + threadIdx_x // 7) % 9 and (T.Mul(9, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +10 + +T.Mul(10, 7) + +T.Mul(10, 7) + threadIdx_x // 7 + +T.Mul(10, 7) + threadIdx_x // 7 < 576 + +T.Mul(10, 49) + +T.Mul(10, 49) + threadIdx_x + +T.Mul(10, 49) + threadIdx_x < 4032 + +T.Mul(10, 7) + +T.Mul(10, 7) + threadIdx_x // 7 + +(T.Mul(10, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(10, 7) + threadIdx_x // 7) % 9 + +T.Mul(10, 7) + +T.Mul(10, 7) + threadIdx_x // 7 + +(T.Mul(10, 7) + threadIdx_x // 7) % 9 + +(T.Mul(10, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(10, 7) + threadIdx_x // 7) % 9 and (T.Mul(10, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(10, 7) + threadIdx_x // 7) % 9 and (T.Mul(10, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(10, 7) + threadIdx_x // 7) % 9 and (T.Mul(10, 7) + 
threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(10, 7) + +T.Mul(10, 7) + threadIdx_x // 7 + +(T.Mul(10, 7) + threadIdx_x // 7) // 9 + +(T.Mul(10, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(10, 7) + +T.Mul(10, 7) + threadIdx_x // 7 + +(T.Mul(10, 7) + threadIdx_x // 7) % 9 + +(T.Mul(10, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(10, 7) + threadIdx_x // 7) % 9 and (T.Mul(10, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(10, 49) + +T.Mul(10, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(10, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(10, 7) + threadIdx_x // 7) % 9 and (T.Mul(10, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer 
* 3136 + (T.Mul(10, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(10, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(10, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(10, 7) + threadIdx_x // 7) % 9 and (T.Mul(10, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(10, 7) + threadIdx_x // 7 < 576: + if T.Mul(10, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(10, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(10, 7) + threadIdx_x // 7) % 9 and (T.Mul(10, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(10, 7) + threadIdx_x // 7 < 576: + if T.Mul(10, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(10, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(10, 7) + threadIdx_x // 7) % 9 and (T.Mul(10, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(10, 7) + 
threadIdx_x // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +11 + +T.Mul(11, 7) + +T.Mul(11, 7) + threadIdx_x // 7 + +T.Mul(11, 7) + threadIdx_x // 7 < 576 + +T.Mul(11, 49) + +T.Mul(11, 49) + threadIdx_x + +T.Mul(11, 49) + threadIdx_x < 4032 + +T.Mul(11, 7) + +T.Mul(11, 7) + threadIdx_x // 7 + +(T.Mul(11, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(11, 7) + threadIdx_x // 7) % 9 + +T.Mul(11, 7) + +T.Mul(11, 7) + threadIdx_x // 7 + +(T.Mul(11, 7) + threadIdx_x // 7) % 9 + +(T.Mul(11, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(11, 7) + threadIdx_x // 7) % 9 and (T.Mul(11, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(11, 7) + threadIdx_x // 7) % 9 and (T.Mul(11, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(11, 7) + threadIdx_x // 7) % 9 and (T.Mul(11, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(11, 7) + +T.Mul(11, 7) + threadIdx_x // 7 + +(T.Mul(11, 7) + threadIdx_x // 7) // 9 + +(T.Mul(11, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(11, 7) + +T.Mul(11, 7) + threadIdx_x // 7 + +(T.Mul(11, 7) + threadIdx_x // 7) % 9 + +(T.Mul(11, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x // 7) 
% 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(11, 7) + threadIdx_x // 7) % 9 and (T.Mul(11, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(11, 49) + +T.Mul(11, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(11, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(11, 7) + threadIdx_x // 7) % 9 and (T.Mul(11, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(11, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(11, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(11, 7) + threadIdx_x // 7) % 9 and (T.Mul(11, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(11, 7) + threadIdx_x // 7 < 576: + if T.Mul(11, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(11, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(11, 7) + threadIdx_x 
// 7) % 9 and (T.Mul(11, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(11, 7) + threadIdx_x // 7 < 576: + if T.Mul(11, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(11, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(11, 7) + threadIdx_x // 7) % 9 and (T.Mul(11, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +12 + +T.Mul(12, 7) + +T.Mul(12, 7) + threadIdx_x // 7 + +T.Mul(12, 7) + threadIdx_x // 7 < 576 + +T.Mul(12, 49) + +T.Mul(12, 49) + threadIdx_x + +T.Mul(12, 49) + threadIdx_x < 4032 + +T.Mul(12, 7) + +T.Mul(12, 7) + threadIdx_x // 7 + +(T.Mul(12, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(12, 7) + threadIdx_x // 7) % 9 + +T.Mul(12, 7) + +T.Mul(12, 7) + threadIdx_x // 7 + +(T.Mul(12, 7) + threadIdx_x // 7) % 9 + +(T.Mul(12, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(12, 7) + threadIdx_x // 7) % 9 and (T.Mul(12, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(12, 7) + threadIdx_x // 7) % 9 and (T.Mul(12, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(12, 7) + threadIdx_x // 7) % 9 and (T.Mul(12, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(12, 7) + +T.Mul(12, 7) + threadIdx_x // 7 + +(T.Mul(12, 7) + threadIdx_x // 7) // 9 + +(T.Mul(12, 7) + threadIdx_x // 7) 
// 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(12, 7) + +T.Mul(12, 7) + threadIdx_x // 7 + +(T.Mul(12, 7) + threadIdx_x // 7) % 9 + +(T.Mul(12, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(12, 7) + threadIdx_x // 7) % 9 and (T.Mul(12, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(12, 49) + +T.Mul(12, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(12, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(12, 7) + threadIdx_x // 7) % 9 and (T.Mul(12, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(12, 49) + threadIdx_x < 4032: + pad_temp_shared 
= T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(12, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(12, 7) + threadIdx_x // 7) % 9 and (T.Mul(12, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(12, 7) + threadIdx_x // 7 < 576: + if T.Mul(12, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(12, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(12, 7) + threadIdx_x // 7) % 9 and (T.Mul(12, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(12, 7) + threadIdx_x // 7 < 576: + if T.Mul(12, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(12, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(12, 7) + threadIdx_x // 7) % 9 and (T.Mul(12, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +13 + +T.Mul(13, 7) + +T.Mul(13, 7) + threadIdx_x // 7 + +T.Mul(13, 7) + threadIdx_x // 7 < 576 + 
+T.Mul(13, 49) + +T.Mul(13, 49) + threadIdx_x + +T.Mul(13, 49) + threadIdx_x < 4032 + +T.Mul(13, 7) + +T.Mul(13, 7) + threadIdx_x // 7 + +(T.Mul(13, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(13, 7) + threadIdx_x // 7) % 9 + +T.Mul(13, 7) + +T.Mul(13, 7) + threadIdx_x // 7 + +(T.Mul(13, 7) + threadIdx_x // 7) % 9 + +(T.Mul(13, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(13, 7) + threadIdx_x // 7) % 9 and (T.Mul(13, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(13, 7) + threadIdx_x // 7) % 9 and (T.Mul(13, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(13, 7) + threadIdx_x // 7) % 9 and (T.Mul(13, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(13, 7) + +T.Mul(13, 7) + threadIdx_x // 7 + +(T.Mul(13, 7) + threadIdx_x // 7) // 9 + +(T.Mul(13, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(13, 7) + +T.Mul(13, 7) + threadIdx_x // 7 + +(T.Mul(13, 7) + threadIdx_x // 7) % 9 + +(T.Mul(13, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(13, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(13, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(13, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(13, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(13, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(13, 7) + threadIdx_x // 7) % 9 and (T.Mul(13, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + 
threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(13, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(13, 49) + +T.Mul(13, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(13, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(13, 7) + threadIdx_x // 7) % 9 and (T.Mul(13, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(13, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(13, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(13, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(13, 7) + threadIdx_x // 7) % 9 and (T.Mul(13, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(13, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(13, 7) + threadIdx_x // 7 < 576: + if T.Mul(13, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(13, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(13, 7) + threadIdx_x // 7) % 9 and (T.Mul(13, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(13, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(13, 7) + threadIdx_x // 7 < 576: + if T.Mul(13, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(13, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(13, 7) + threadIdx_x // 7) % 9 and (T.Mul(13, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(13, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +14 + +T.Mul(14, 7) + +T.Mul(14, 7) + threadIdx_x // 7 + +T.Mul(14, 7) + threadIdx_x // 7 < 576 + +T.Mul(14, 49) + +T.Mul(14, 49) + threadIdx_x + +T.Mul(14, 49) + threadIdx_x < 4032 + +T.Mul(14, 7) + +T.Mul(14, 7) + threadIdx_x // 7 + +(T.Mul(14, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(14, 7) + threadIdx_x // 7) % 9 + +T.Mul(14, 7) + +T.Mul(14, 7) + threadIdx_x // 7 + +(T.Mul(14, 7) + threadIdx_x // 7) % 9 + +(T.Mul(14, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(14, 7) + threadIdx_x // 7) % 9 and (T.Mul(14, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(14, 7) + threadIdx_x // 7) % 9 and (T.Mul(14, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(14, 7) + threadIdx_x // 7) % 9 and (T.Mul(14, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(14, 7) + +T.Mul(14, 7) + threadIdx_x // 7 + +(T.Mul(14, 7) + threadIdx_x // 7) // 9 + +(T.Mul(14, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(14, 7) + +T.Mul(14, 7) + threadIdx_x // 7 + +(T.Mul(14, 7) + threadIdx_x // 7) % 9 + +(T.Mul(14, 7) + threadIdx_x // 7) % 9 * 
7 + +rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(14, 7) + threadIdx_x // 7) % 9 and (T.Mul(14, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(14, 49) + +T.Mul(14, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(14, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(14, 7) + threadIdx_x // 7) % 9 and (T.Mul(14, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(14, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(14, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(14, 7) + 
threadIdx_x // 7) % 9 and (T.Mul(14, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(14, 7) + threadIdx_x // 7 < 576: + if T.Mul(14, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(14, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(14, 7) + threadIdx_x // 7) % 9 and (T.Mul(14, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(14, 7) + threadIdx_x // 7 < 576: + if T.Mul(14, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(14, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(14, 7) + threadIdx_x // 7) % 9 and (T.Mul(14, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +15 + +T.Mul(15, 7) + +T.Mul(15, 7) + threadIdx_x // 7 + +T.Mul(15, 7) + threadIdx_x // 7 < 576 + +T.Mul(15, 49) + +T.Mul(15, 49) + threadIdx_x + +T.Mul(15, 49) + threadIdx_x < 4032 + +T.Mul(15, 7) + +T.Mul(15, 7) + threadIdx_x // 7 + +(T.Mul(15, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(15, 7) + threadIdx_x // 
7) % 9 + +T.Mul(15, 7) + +T.Mul(15, 7) + threadIdx_x // 7 + +(T.Mul(15, 7) + threadIdx_x // 7) % 9 + +(T.Mul(15, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(15, 7) + threadIdx_x // 7) % 9 and (T.Mul(15, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(15, 7) + threadIdx_x // 7) % 9 and (T.Mul(15, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(15, 7) + threadIdx_x // 7) % 9 and (T.Mul(15, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(15, 7) + +T.Mul(15, 7) + threadIdx_x // 7 + +(T.Mul(15, 7) + threadIdx_x // 7) // 9 + +(T.Mul(15, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(15, 7) + +T.Mul(15, 7) + threadIdx_x // 7 + +(T.Mul(15, 7) + threadIdx_x // 7) % 9 + +(T.Mul(15, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(15, 7) + threadIdx_x // 7) % 9 and (T.Mul(15, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(15, 49) + +T.Mul(15, 
49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(15, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(15, 7) + threadIdx_x // 7) % 9 and (T.Mul(15, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(15, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(15, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(15, 7) + threadIdx_x // 7) % 9 and (T.Mul(15, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(15, 7) + threadIdx_x // 7 < 576: + if T.Mul(15, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(15, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(15, 7) + threadIdx_x // 7) % 9 and (T.Mul(15, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(15, 7) + threadIdx_x // 7 < 576: + if T.Mul(15, 49) + 
threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(15, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(15, 7) + threadIdx_x // 7) % 9 and (T.Mul(15, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +16 + +T.Mul(16, 7) + +T.Mul(16, 7) + threadIdx_x // 7 + +T.Mul(16, 7) + threadIdx_x // 7 < 576 + +T.Mul(16, 49) + +T.Mul(16, 49) + threadIdx_x + +T.Mul(16, 49) + threadIdx_x < 4032 + +T.Mul(16, 7) + +T.Mul(16, 7) + threadIdx_x // 7 + +(T.Mul(16, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(16, 7) + threadIdx_x // 7) % 9 + +T.Mul(16, 7) + +T.Mul(16, 7) + threadIdx_x // 7 + +(T.Mul(16, 7) + threadIdx_x // 7) % 9 + +(T.Mul(16, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(16, 7) + threadIdx_x // 7) % 9 and (T.Mul(16, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(16, 7) + threadIdx_x // 7) % 9 and (T.Mul(16, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(16, 7) + threadIdx_x // 7) % 9 and (T.Mul(16, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(16, 7) + +T.Mul(16, 7) + threadIdx_x // 7 + +(T.Mul(16, 7) + threadIdx_x // 7) // 9 + +(T.Mul(16, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(16, 7) + +T.Mul(16, 7) + threadIdx_x // 7 + +(T.Mul(16, 7) + threadIdx_x // 7) % 9 + +(T.Mul(16, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(16, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(16, 7) + threadIdx_x // 
7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(16, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(16, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(16, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(16, 7) + threadIdx_x // 7) % 9 and (T.Mul(16, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(16, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(16, 49) + +T.Mul(16, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(16, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(16, 7) + threadIdx_x // 7) % 9 and (T.Mul(16, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(16, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(16, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(16, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(16, 7) + threadIdx_x // 7) % 9 and (T.Mul(16, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(16, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(16, 7) + threadIdx_x // 7 < 576: + if T.Mul(16, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(16, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(16, 7) + threadIdx_x // 7) % 9 and (T.Mul(16, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(16, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(16, 7) + threadIdx_x // 7 < 576: + if T.Mul(16, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(16, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(16, 7) + threadIdx_x // 7) % 9 and (T.Mul(16, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(16, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +17 + +T.Mul(17, 7) + +T.Mul(17, 7) + threadIdx_x // 7 + +T.Mul(17, 7) + threadIdx_x // 7 < 576 + +T.Mul(17, 49) + +T.Mul(17, 49) + threadIdx_x + +T.Mul(17, 49) + threadIdx_x < 4032 + +T.Mul(17, 7) + +T.Mul(17, 7) + threadIdx_x // 7 + +(T.Mul(17, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(17, 7) + threadIdx_x // 7) % 9 + +T.Mul(17, 7) + +T.Mul(17, 7) + threadIdx_x // 7 + +(T.Mul(17, 7) + threadIdx_x // 7) % 9 + +(T.Mul(17, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(17, 7) + threadIdx_x // 7) % 9 and (T.Mul(17, 7) + threadIdx_x // 7) 
% 9 < 8 + +1 <= (T.Mul(17, 7) + threadIdx_x // 7) % 9 and (T.Mul(17, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(17, 7) + threadIdx_x // 7) % 9 and (T.Mul(17, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(17, 7) + +T.Mul(17, 7) + threadIdx_x // 7 + +(T.Mul(17, 7) + threadIdx_x // 7) // 9 + +(T.Mul(17, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(17, 7) + +T.Mul(17, 7) + threadIdx_x // 7 + +(T.Mul(17, 7) + threadIdx_x // 7) % 9 + +(T.Mul(17, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(17, 7) + threadIdx_x // 7) % 9 and (T.Mul(17, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(17, 49) + +T.Mul(17, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(17, 49) + threadIdx_x] = 
T.if_then_else(1 <= (T.Mul(17, 7) + threadIdx_x // 7) % 9 and (T.Mul(17, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(17, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(17, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(17, 7) + threadIdx_x // 7) % 9 and (T.Mul(17, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(17, 7) + threadIdx_x // 7 < 576: + if T.Mul(17, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(17, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(17, 7) + threadIdx_x // 7) % 9 and (T.Mul(17, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(17, 7) + threadIdx_x // 7 < 576: + if T.Mul(17, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(17, 49) + threadIdx_x] = T.if_then_else(1 <= 
(T.Mul(17, 7) + threadIdx_x // 7) % 9 and (T.Mul(17, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +18 + +T.Mul(18, 7) + +T.Mul(18, 7) + threadIdx_x // 7 + +T.Mul(18, 7) + threadIdx_x // 7 < 576 + +T.Mul(18, 49) + +T.Mul(18, 49) + threadIdx_x + +T.Mul(18, 49) + threadIdx_x < 4032 + +T.Mul(18, 7) + +T.Mul(18, 7) + threadIdx_x // 7 + +(T.Mul(18, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(18, 7) + threadIdx_x // 7) % 9 + +T.Mul(18, 7) + +T.Mul(18, 7) + threadIdx_x // 7 + +(T.Mul(18, 7) + threadIdx_x // 7) % 9 + +(T.Mul(18, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(18, 7) + threadIdx_x // 7) % 9 and (T.Mul(18, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(18, 7) + threadIdx_x // 7) % 9 and (T.Mul(18, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(18, 7) + threadIdx_x // 7) % 9 and (T.Mul(18, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(18, 7) + +T.Mul(18, 7) + threadIdx_x // 7 + +(T.Mul(18, 7) + threadIdx_x // 7) // 9 + +(T.Mul(18, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(18, 7) + +T.Mul(18, 7) + threadIdx_x // 7 + +(T.Mul(18, 7) + threadIdx_x // 7) % 9 + +(T.Mul(18, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(18, 7) + 
threadIdx_x // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(18, 7) + threadIdx_x // 7) % 9 and (T.Mul(18, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(18, 49) + +T.Mul(18, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(18, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(18, 7) + threadIdx_x // 7) % 9 and (T.Mul(18, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(18, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(18, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(18, 7) + threadIdx_x // 7) % 9 and (T.Mul(18, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(18, 7) + threadIdx_x // 7 < 576: + if T.Mul(18, 49) + threadIdx_x < 4032: + 
pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(18, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(18, 7) + threadIdx_x // 7) % 9 and (T.Mul(18, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(18, 7) + threadIdx_x // 7 < 576: + if T.Mul(18, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(18, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(18, 7) + threadIdx_x // 7) % 9 and (T.Mul(18, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +19 + +T.Mul(19, 7) + +T.Mul(19, 7) + threadIdx_x // 7 + +T.Mul(19, 7) + threadIdx_x // 7 < 576 + +T.Mul(19, 49) + +T.Mul(19, 49) + threadIdx_x + +T.Mul(19, 49) + threadIdx_x < 4032 + +T.Mul(19, 7) + +T.Mul(19, 7) + threadIdx_x // 7 + +(T.Mul(19, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(19, 7) + threadIdx_x // 7) % 9 + +T.Mul(19, 7) + +T.Mul(19, 7) + threadIdx_x // 7 + +(T.Mul(19, 7) + threadIdx_x // 7) % 9 + +(T.Mul(19, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(19, 7) + threadIdx_x // 7) % 9 and (T.Mul(19, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(19, 7) + threadIdx_x // 7) % 9 and (T.Mul(19, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(19, 7) + threadIdx_x // 7) % 9 and (T.Mul(19, 7) + 
threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(19, 7) + +T.Mul(19, 7) + threadIdx_x // 7 + +(T.Mul(19, 7) + threadIdx_x // 7) // 9 + +(T.Mul(19, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(19, 7) + +T.Mul(19, 7) + threadIdx_x // 7 + +(T.Mul(19, 7) + threadIdx_x // 7) % 9 + +(T.Mul(19, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(19, 7) + threadIdx_x // 7) % 9 and (T.Mul(19, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(19, 49) + +T.Mul(19, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(19, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(19, 7) + threadIdx_x // 7) % 9 and (T.Mul(19, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer 
* 3136 + (T.Mul(19, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(19, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(19, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(19, 7) + threadIdx_x // 7) % 9 and (T.Mul(19, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(19, 7) + threadIdx_x // 7 < 576: + if T.Mul(19, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(19, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(19, 7) + threadIdx_x // 7) % 9 and (T.Mul(19, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(19, 7) + threadIdx_x // 7 < 576: + if T.Mul(19, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(19, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(19, 7) + threadIdx_x // 7) % 9 and (T.Mul(19, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(19, 7) + 
threadIdx_x // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +20 + +T.Mul(20, 7) + +T.Mul(20, 7) + threadIdx_x // 7 + +T.Mul(20, 7) + threadIdx_x // 7 < 576 + +T.Mul(20, 49) + +T.Mul(20, 49) + threadIdx_x + +T.Mul(20, 49) + threadIdx_x < 4032 + +T.Mul(20, 7) + +T.Mul(20, 7) + threadIdx_x // 7 + +(T.Mul(20, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(20, 7) + threadIdx_x // 7) % 9 + +T.Mul(20, 7) + +T.Mul(20, 7) + threadIdx_x // 7 + +(T.Mul(20, 7) + threadIdx_x // 7) % 9 + +(T.Mul(20, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(20, 7) + threadIdx_x // 7) % 9 and (T.Mul(20, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(20, 7) + threadIdx_x // 7) % 9 and (T.Mul(20, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(20, 7) + threadIdx_x // 7) % 9 and (T.Mul(20, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(20, 7) + +T.Mul(20, 7) + threadIdx_x // 7 + +(T.Mul(20, 7) + threadIdx_x // 7) // 9 + +(T.Mul(20, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(20, 7) + +T.Mul(20, 7) + threadIdx_x // 7 + +(T.Mul(20, 7) + threadIdx_x // 7) % 9 + +(T.Mul(20, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x // 7) 
% 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(20, 7) + threadIdx_x // 7) % 9 and (T.Mul(20, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(20, 49) + +T.Mul(20, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(20, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(20, 7) + threadIdx_x // 7) % 9 and (T.Mul(20, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(20, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(20, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(20, 7) + threadIdx_x // 7) % 9 and (T.Mul(20, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(20, 7) + threadIdx_x // 7 < 576: + if T.Mul(20, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(20, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(20, 7) + threadIdx_x 
// 7) % 9 and (T.Mul(20, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(20, 7) + threadIdx_x // 7 < 576: + if T.Mul(20, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(20, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(20, 7) + threadIdx_x // 7) % 9 and (T.Mul(20, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +21 + +T.Mul(21, 7) + +T.Mul(21, 7) + threadIdx_x // 7 + +T.Mul(21, 7) + threadIdx_x // 7 < 576 + +T.Mul(21, 49) + +T.Mul(21, 49) + threadIdx_x + +T.Mul(21, 49) + threadIdx_x < 4032 + +T.Mul(21, 7) + +T.Mul(21, 7) + threadIdx_x // 7 + +(T.Mul(21, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(21, 7) + threadIdx_x // 7) % 9 + +T.Mul(21, 7) + +T.Mul(21, 7) + threadIdx_x // 7 + +(T.Mul(21, 7) + threadIdx_x // 7) % 9 + +(T.Mul(21, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(21, 7) + threadIdx_x // 7) % 9 and (T.Mul(21, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(21, 7) + threadIdx_x // 7) % 9 and (T.Mul(21, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(21, 7) + threadIdx_x // 7) % 9 and (T.Mul(21, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(21, 7) + +T.Mul(21, 7) + threadIdx_x // 7 + +(T.Mul(21, 7) + threadIdx_x // 7) // 9 + +(T.Mul(21, 7) + threadIdx_x // 7) 
// 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(21, 7) + +T.Mul(21, 7) + threadIdx_x // 7 + +(T.Mul(21, 7) + threadIdx_x // 7) % 9 + +(T.Mul(21, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(21, 7) + threadIdx_x // 7) % 9 and (T.Mul(21, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(21, 49) + +T.Mul(21, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(21, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(21, 7) + threadIdx_x // 7) % 9 and (T.Mul(21, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(21, 49) + threadIdx_x < 4032: + pad_temp_shared 
= T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(21, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(21, 7) + threadIdx_x // 7) % 9 and (T.Mul(21, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(21, 7) + threadIdx_x // 7 < 576: + if T.Mul(21, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(21, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(21, 7) + threadIdx_x // 7) % 9 and (T.Mul(21, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(21, 7) + threadIdx_x // 7 < 576: + if T.Mul(21, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(21, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(21, 7) + threadIdx_x // 7) % 9 and (T.Mul(21, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +22 + +T.Mul(22, 7) + +T.Mul(22, 7) + threadIdx_x // 7 + +T.Mul(22, 7) + threadIdx_x // 7 < 576 + 
+T.Mul(22, 49) + +T.Mul(22, 49) + threadIdx_x + +T.Mul(22, 49) + threadIdx_x < 4032 + +T.Mul(22, 7) + +T.Mul(22, 7) + threadIdx_x // 7 + +(T.Mul(22, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(22, 7) + threadIdx_x // 7) % 9 + +T.Mul(22, 7) + +T.Mul(22, 7) + threadIdx_x // 7 + +(T.Mul(22, 7) + threadIdx_x // 7) % 9 + +(T.Mul(22, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(22, 7) + threadIdx_x // 7) % 9 and (T.Mul(22, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(22, 7) + threadIdx_x // 7) % 9 and (T.Mul(22, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(22, 7) + threadIdx_x // 7) % 9 and (T.Mul(22, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(22, 7) + +T.Mul(22, 7) + threadIdx_x // 7 + +(T.Mul(22, 7) + threadIdx_x // 7) // 9 + +(T.Mul(22, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(22, 7) + +T.Mul(22, 7) + threadIdx_x // 7 + +(T.Mul(22, 7) + threadIdx_x // 7) % 9 + +(T.Mul(22, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(22, 7) + threadIdx_x // 7) % 9 and (T.Mul(22, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + 
threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(22, 49) + +T.Mul(22, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(22, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(22, 7) + threadIdx_x // 7) % 9 and (T.Mul(22, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(22, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(22, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(22, 7) + threadIdx_x // 7) % 9 and (T.Mul(22, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(22, 7) + threadIdx_x // 7 < 576: + if T.Mul(22, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(22, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(22, 7) + threadIdx_x // 7) % 9 and (T.Mul(22, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(22, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(22, 7) + threadIdx_x // 7 < 576: + if T.Mul(22, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(22, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(22, 7) + threadIdx_x // 7) % 9 and (T.Mul(22, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +23 + +T.Mul(23, 7) + +T.Mul(23, 7) + threadIdx_x // 7 + +T.Mul(23, 7) + threadIdx_x // 7 < 576 + +T.Mul(23, 49) + +T.Mul(23, 49) + threadIdx_x + +T.Mul(23, 49) + threadIdx_x < 4032 + +T.Mul(23, 7) + +T.Mul(23, 7) + threadIdx_x // 7 + +(T.Mul(23, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(23, 7) + threadIdx_x // 7) % 9 + +T.Mul(23, 7) + +T.Mul(23, 7) + threadIdx_x // 7 + +(T.Mul(23, 7) + threadIdx_x // 7) % 9 + +(T.Mul(23, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(23, 7) + threadIdx_x // 7) % 9 and (T.Mul(23, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(23, 7) + threadIdx_x // 7) % 9 and (T.Mul(23, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(23, 7) + threadIdx_x // 7) % 9 and (T.Mul(23, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(23, 7) + +T.Mul(23, 7) + threadIdx_x // 7 + +(T.Mul(23, 7) + threadIdx_x // 7) // 9 + +(T.Mul(23, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(23, 7) + +T.Mul(23, 7) + threadIdx_x // 7 + +(T.Mul(23, 7) + threadIdx_x // 7) % 9 + +(T.Mul(23, 7) + threadIdx_x // 7) % 9 * 
7 + +rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(23, 7) + threadIdx_x // 7) % 9 and (T.Mul(23, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(23, 49) + +T.Mul(23, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(23, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(23, 7) + threadIdx_x // 7) % 9 and (T.Mul(23, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(23, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(23, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(23, 7) + 
threadIdx_x // 7) % 9 and (T.Mul(23, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(23, 7) + threadIdx_x // 7 < 576: + if T.Mul(23, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(23, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(23, 7) + threadIdx_x // 7) % 9 and (T.Mul(23, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(23, 7) + threadIdx_x // 7 < 576: + if T.Mul(23, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(23, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(23, 7) + threadIdx_x // 7) % 9 and (T.Mul(23, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +24 + +T.Mul(24, 7) + +T.Mul(24, 7) + threadIdx_x // 7 + +T.Mul(24, 7) + threadIdx_x // 7 < 576 + +T.Mul(24, 49) + +T.Mul(24, 49) + threadIdx_x + +T.Mul(24, 49) + threadIdx_x < 4032 + +T.Mul(24, 7) + +T.Mul(24, 7) + threadIdx_x // 7 + +(T.Mul(24, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(24, 7) + threadIdx_x // 
7) % 9 + +T.Mul(24, 7) + +T.Mul(24, 7) + threadIdx_x // 7 + +(T.Mul(24, 7) + threadIdx_x // 7) % 9 + +(T.Mul(24, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(24, 7) + threadIdx_x // 7) % 9 and (T.Mul(24, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(24, 7) + threadIdx_x // 7) % 9 and (T.Mul(24, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(24, 7) + threadIdx_x // 7) % 9 and (T.Mul(24, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(24, 7) + +T.Mul(24, 7) + threadIdx_x // 7 + +(T.Mul(24, 7) + threadIdx_x // 7) // 9 + +(T.Mul(24, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(24, 7) + +T.Mul(24, 7) + threadIdx_x // 7 + +(T.Mul(24, 7) + threadIdx_x // 7) % 9 + +(T.Mul(24, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(24, 7) + threadIdx_x // 7) % 9 and (T.Mul(24, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(24, 49) + +T.Mul(24, 
49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(24, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(24, 7) + threadIdx_x // 7) % 9 and (T.Mul(24, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(24, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(24, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(24, 7) + threadIdx_x // 7) % 9 and (T.Mul(24, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(24, 7) + threadIdx_x // 7 < 576: + if T.Mul(24, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(24, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(24, 7) + threadIdx_x // 7) % 9 and (T.Mul(24, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(24, 7) + threadIdx_x // 7 < 576: + if T.Mul(24, 49) + 
threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(24, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(24, 7) + threadIdx_x // 7) % 9 and (T.Mul(24, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +25 + +T.Mul(25, 7) + +T.Mul(25, 7) + threadIdx_x // 7 + +T.Mul(25, 7) + threadIdx_x // 7 < 576 + +T.Mul(25, 49) + +T.Mul(25, 49) + threadIdx_x + +T.Mul(25, 49) + threadIdx_x < 4032 + +T.Mul(25, 7) + +T.Mul(25, 7) + threadIdx_x // 7 + +(T.Mul(25, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(25, 7) + threadIdx_x // 7) % 9 + +T.Mul(25, 7) + +T.Mul(25, 7) + threadIdx_x // 7 + +(T.Mul(25, 7) + threadIdx_x // 7) % 9 + +(T.Mul(25, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(25, 7) + threadIdx_x // 7) % 9 and (T.Mul(25, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(25, 7) + threadIdx_x // 7) % 9 and (T.Mul(25, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(25, 7) + threadIdx_x // 7) % 9 and (T.Mul(25, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(25, 7) + +T.Mul(25, 7) + threadIdx_x // 7 + +(T.Mul(25, 7) + threadIdx_x // 7) // 9 + +(T.Mul(25, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(25, 7) + +T.Mul(25, 7) + threadIdx_x // 7 + +(T.Mul(25, 7) + threadIdx_x // 7) % 9 + +(T.Mul(25, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(25, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(25, 7) + threadIdx_x // 
7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(25, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(25, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(25, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(25, 7) + threadIdx_x // 7) % 9 and (T.Mul(25, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(25, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(25, 49) + +T.Mul(25, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(25, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(25, 7) + threadIdx_x // 7) % 9 and (T.Mul(25, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(25, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(25, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(25, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(25, 7) + threadIdx_x // 7) % 9 and (T.Mul(25, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(25, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(25, 7) + threadIdx_x // 7 < 576: + if T.Mul(25, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(25, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(25, 7) + threadIdx_x // 7) % 9 and (T.Mul(25, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(25, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(25, 7) + threadIdx_x // 7 < 576: + if T.Mul(25, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(25, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(25, 7) + threadIdx_x // 7) % 9 and (T.Mul(25, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(25, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +26 + +T.Mul(26, 7) + +T.Mul(26, 7) + threadIdx_x // 7 + +T.Mul(26, 7) + threadIdx_x // 7 < 576 + +T.Mul(26, 49) + +T.Mul(26, 49) + threadIdx_x + +T.Mul(26, 49) + threadIdx_x < 4032 + +T.Mul(26, 7) + +T.Mul(26, 7) + threadIdx_x // 7 + +(T.Mul(26, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(26, 7) + threadIdx_x // 7) % 9 + +T.Mul(26, 7) + +T.Mul(26, 7) + threadIdx_x // 7 + +(T.Mul(26, 7) + threadIdx_x // 7) % 9 + +(T.Mul(26, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(26, 7) + threadIdx_x // 7) % 9 and (T.Mul(26, 7) + threadIdx_x // 7) 
% 9 < 8 + +1 <= (T.Mul(26, 7) + threadIdx_x // 7) % 9 and (T.Mul(26, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(26, 7) + threadIdx_x // 7) % 9 and (T.Mul(26, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(26, 7) + +T.Mul(26, 7) + threadIdx_x // 7 + +(T.Mul(26, 7) + threadIdx_x // 7) // 9 + +(T.Mul(26, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(26, 7) + +T.Mul(26, 7) + threadIdx_x // 7 + +(T.Mul(26, 7) + threadIdx_x // 7) % 9 + +(T.Mul(26, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(26, 7) + threadIdx_x // 7) % 9 and (T.Mul(26, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(26, 49) + +T.Mul(26, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(26, 49) + threadIdx_x] = 
T.if_then_else(1 <= (T.Mul(26, 7) + threadIdx_x // 7) % 9 and (T.Mul(26, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(26, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(26, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(26, 7) + threadIdx_x // 7) % 9 and (T.Mul(26, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(26, 7) + threadIdx_x // 7 < 576: + if T.Mul(26, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(26, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(26, 7) + threadIdx_x // 7) % 9 and (T.Mul(26, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(26, 7) + threadIdx_x // 7 < 576: + if T.Mul(26, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(26, 49) + threadIdx_x] = T.if_then_else(1 <= 
(T.Mul(26, 7) + threadIdx_x // 7) % 9 and (T.Mul(26, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +27 + +T.Mul(27, 7) + +T.Mul(27, 7) + threadIdx_x // 7 + +T.Mul(27, 7) + threadIdx_x // 7 < 576 + +T.Mul(27, 49) + +T.Mul(27, 49) + threadIdx_x + +T.Mul(27, 49) + threadIdx_x < 4032 + +T.Mul(27, 7) + +T.Mul(27, 7) + threadIdx_x // 7 + +(T.Mul(27, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(27, 7) + threadIdx_x // 7) % 9 + +T.Mul(27, 7) + +T.Mul(27, 7) + threadIdx_x // 7 + +(T.Mul(27, 7) + threadIdx_x // 7) % 9 + +(T.Mul(27, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(27, 7) + threadIdx_x // 7) % 9 and (T.Mul(27, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(27, 7) + threadIdx_x // 7) % 9 and (T.Mul(27, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(27, 7) + threadIdx_x // 7) % 9 and (T.Mul(27, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(27, 7) + +T.Mul(27, 7) + threadIdx_x // 7 + +(T.Mul(27, 7) + threadIdx_x // 7) // 9 + +(T.Mul(27, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(27, 7) + +T.Mul(27, 7) + threadIdx_x // 7 + +(T.Mul(27, 7) + threadIdx_x // 7) % 9 + +(T.Mul(27, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(27, 7) + 
threadIdx_x // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(27, 7) + threadIdx_x // 7) % 9 and (T.Mul(27, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(27, 49) + +T.Mul(27, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(27, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(27, 7) + threadIdx_x // 7) % 9 and (T.Mul(27, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(27, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(27, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(27, 7) + threadIdx_x // 7) % 9 and (T.Mul(27, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(27, 7) + threadIdx_x // 7 < 576: + if T.Mul(27, 49) + threadIdx_x < 4032: + 
pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(27, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(27, 7) + threadIdx_x // 7) % 9 and (T.Mul(27, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(27, 7) + threadIdx_x // 7 < 576: + if T.Mul(27, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(27, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(27, 7) + threadIdx_x // 7) % 9 and (T.Mul(27, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +28 + +T.Mul(28, 7) + +T.Mul(28, 7) + threadIdx_x // 7 + +T.Mul(28, 7) + threadIdx_x // 7 < 576 + +T.Mul(28, 49) + +T.Mul(28, 49) + threadIdx_x + +T.Mul(28, 49) + threadIdx_x < 4032 + +T.Mul(28, 7) + +T.Mul(28, 7) + threadIdx_x // 7 + +(T.Mul(28, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(28, 7) + threadIdx_x // 7) % 9 + +T.Mul(28, 7) + +T.Mul(28, 7) + threadIdx_x // 7 + +(T.Mul(28, 7) + threadIdx_x // 7) % 9 + +(T.Mul(28, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(28, 7) + threadIdx_x // 7) % 9 and (T.Mul(28, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(28, 7) + threadIdx_x // 7) % 9 and (T.Mul(28, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(28, 7) + threadIdx_x // 7) % 9 and (T.Mul(28, 7) + 
threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(28, 7) + +T.Mul(28, 7) + threadIdx_x // 7 + +(T.Mul(28, 7) + threadIdx_x // 7) // 9 + +(T.Mul(28, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(28, 7) + +T.Mul(28, 7) + threadIdx_x // 7 + +(T.Mul(28, 7) + threadIdx_x // 7) % 9 + +(T.Mul(28, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(28, 7) + threadIdx_x // 7) % 9 and (T.Mul(28, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(28, 49) + +T.Mul(28, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(28, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(28, 7) + threadIdx_x // 7) % 9 and (T.Mul(28, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer 
* 3136 + (T.Mul(28, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(28, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(28, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(28, 7) + threadIdx_x // 7) % 9 and (T.Mul(28, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(28, 7) + threadIdx_x // 7 < 576: + if T.Mul(28, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(28, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(28, 7) + threadIdx_x // 7) % 9 and (T.Mul(28, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(28, 7) + threadIdx_x // 7 < 576: + if T.Mul(28, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(28, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(28, 7) + threadIdx_x // 7) % 9 and (T.Mul(28, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(28, 7) + 
threadIdx_x // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +29 + +T.Mul(29, 7) + +T.Mul(29, 7) + threadIdx_x // 7 + +T.Mul(29, 7) + threadIdx_x // 7 < 576 + +T.Mul(29, 49) + +T.Mul(29, 49) + threadIdx_x + +T.Mul(29, 49) + threadIdx_x < 4032 + +T.Mul(29, 7) + +T.Mul(29, 7) + threadIdx_x // 7 + +(T.Mul(29, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(29, 7) + threadIdx_x // 7) % 9 + +T.Mul(29, 7) + +T.Mul(29, 7) + threadIdx_x // 7 + +(T.Mul(29, 7) + threadIdx_x // 7) % 9 + +(T.Mul(29, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(29, 7) + threadIdx_x // 7) % 9 and (T.Mul(29, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(29, 7) + threadIdx_x // 7) % 9 and (T.Mul(29, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(29, 7) + threadIdx_x // 7) % 9 and (T.Mul(29, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(29, 7) + +T.Mul(29, 7) + threadIdx_x // 7 + +(T.Mul(29, 7) + threadIdx_x // 7) // 9 + +(T.Mul(29, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(29, 7) + +T.Mul(29, 7) + threadIdx_x // 7 + +(T.Mul(29, 7) + threadIdx_x // 7) % 9 + +(T.Mul(29, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x // 7) 
% 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(29, 7) + threadIdx_x // 7) % 9 and (T.Mul(29, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(29, 49) + +T.Mul(29, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(29, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(29, 7) + threadIdx_x // 7) % 9 and (T.Mul(29, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(29, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(29, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(29, 7) + threadIdx_x // 7) % 9 and (T.Mul(29, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(29, 7) + threadIdx_x // 7 < 576: + if T.Mul(29, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(29, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(29, 7) + threadIdx_x 
// 7) % 9 and (T.Mul(29, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(29, 7) + threadIdx_x // 7 < 576: + if T.Mul(29, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(29, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(29, 7) + threadIdx_x // 7) % 9 and (T.Mul(29, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +30 + +T.Mul(30, 7) + +T.Mul(30, 7) + threadIdx_x // 7 + +T.Mul(30, 7) + threadIdx_x // 7 < 576 + +T.Mul(30, 49) + +T.Mul(30, 49) + threadIdx_x + +T.Mul(30, 49) + threadIdx_x < 4032 + +T.Mul(30, 7) + +T.Mul(30, 7) + threadIdx_x // 7 + +(T.Mul(30, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(30, 7) + threadIdx_x // 7) % 9 + +T.Mul(30, 7) + +T.Mul(30, 7) + threadIdx_x // 7 + +(T.Mul(30, 7) + threadIdx_x // 7) % 9 + +(T.Mul(30, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(30, 7) + threadIdx_x // 7) % 9 and (T.Mul(30, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(30, 7) + threadIdx_x // 7) % 9 and (T.Mul(30, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(30, 7) + threadIdx_x // 7) % 9 and (T.Mul(30, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(30, 7) + +T.Mul(30, 7) + threadIdx_x // 7 + +(T.Mul(30, 7) + threadIdx_x // 7) // 9 + +(T.Mul(30, 7) + threadIdx_x // 7) 
// 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(30, 7) + +T.Mul(30, 7) + threadIdx_x // 7 + +(T.Mul(30, 7) + threadIdx_x // 7) % 9 + +(T.Mul(30, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(30, 7) + threadIdx_x // 7) % 9 and (T.Mul(30, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(30, 49) + +T.Mul(30, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(30, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(30, 7) + threadIdx_x // 7) % 9 and (T.Mul(30, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(30, 49) + threadIdx_x < 4032: + pad_temp_shared 
= T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(30, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(30, 7) + threadIdx_x // 7) % 9 and (T.Mul(30, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(30, 7) + threadIdx_x // 7 < 576: + if T.Mul(30, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(30, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(30, 7) + threadIdx_x // 7) % 9 and (T.Mul(30, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(30, 7) + threadIdx_x // 7 < 576: + if T.Mul(30, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(30, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(30, 7) + threadIdx_x // 7) % 9 and (T.Mul(30, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +31 + +T.Mul(31, 7) + +T.Mul(31, 7) + threadIdx_x // 7 + +T.Mul(31, 7) + threadIdx_x // 7 < 576 + 
+T.Mul(31, 49) + +T.Mul(31, 49) + threadIdx_x + +T.Mul(31, 49) + threadIdx_x < 4032 + +T.Mul(31, 7) + +T.Mul(31, 7) + threadIdx_x // 7 + +(T.Mul(31, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(31, 7) + threadIdx_x // 7) % 9 + +T.Mul(31, 7) + +T.Mul(31, 7) + threadIdx_x // 7 + +(T.Mul(31, 7) + threadIdx_x // 7) % 9 + +(T.Mul(31, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(31, 7) + threadIdx_x // 7) % 9 and (T.Mul(31, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(31, 7) + threadIdx_x // 7) % 9 and (T.Mul(31, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(31, 7) + threadIdx_x // 7) % 9 and (T.Mul(31, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(31, 7) + +T.Mul(31, 7) + threadIdx_x // 7 + +(T.Mul(31, 7) + threadIdx_x // 7) // 9 + +(T.Mul(31, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(31, 7) + +T.Mul(31, 7) + threadIdx_x // 7 + +(T.Mul(31, 7) + threadIdx_x // 7) % 9 + +(T.Mul(31, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(31, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(31, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(31, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(31, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(31, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(31, 7) + threadIdx_x // 7) % 9 and (T.Mul(31, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + 
threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(31, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(31, 49) + +T.Mul(31, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(31, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(31, 7) + threadIdx_x // 7) % 9 and (T.Mul(31, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(31, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(31, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(31, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(31, 7) + threadIdx_x // 7) % 9 and (T.Mul(31, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(31, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(31, 7) + threadIdx_x // 7 < 576: + if T.Mul(31, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(31, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(31, 7) + threadIdx_x // 7) % 9 and (T.Mul(31, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(31, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(31, 7) + threadIdx_x // 7 < 576: + if T.Mul(31, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(31, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(31, 7) + threadIdx_x // 7) % 9 and (T.Mul(31, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(31, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +32 + +T.Mul(32, 7) + +T.Mul(32, 7) + threadIdx_x // 7 + +T.Mul(32, 7) + threadIdx_x // 7 < 576 + +T.Mul(32, 49) + +T.Mul(32, 49) + threadIdx_x + +T.Mul(32, 49) + threadIdx_x < 4032 + +T.Mul(32, 7) + +T.Mul(32, 7) + threadIdx_x // 7 + +(T.Mul(32, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(32, 7) + threadIdx_x // 7) % 9 + +T.Mul(32, 7) + +T.Mul(32, 7) + threadIdx_x // 7 + +(T.Mul(32, 7) + threadIdx_x // 7) % 9 + +(T.Mul(32, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(32, 7) + threadIdx_x // 7) % 9 and (T.Mul(32, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(32, 7) + threadIdx_x // 7) % 9 and (T.Mul(32, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(32, 7) + threadIdx_x // 7) % 9 and (T.Mul(32, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(32, 7) + +T.Mul(32, 7) + threadIdx_x // 7 + +(T.Mul(32, 7) + threadIdx_x // 7) // 9 + +(T.Mul(32, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(32, 7) + +T.Mul(32, 7) + threadIdx_x // 7 + +(T.Mul(32, 7) + threadIdx_x // 7) % 9 + +(T.Mul(32, 7) + threadIdx_x // 7) % 9 * 
7 + +rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(32, 7) + threadIdx_x // 7) % 9 and (T.Mul(32, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(32, 49) + +T.Mul(32, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(32, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(32, 7) + threadIdx_x // 7) % 9 and (T.Mul(32, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(32, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(32, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(32, 7) + 
threadIdx_x // 7) % 9 and (T.Mul(32, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(32, 7) + threadIdx_x // 7 < 576: + if T.Mul(32, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(32, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(32, 7) + threadIdx_x // 7) % 9 and (T.Mul(32, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(32, 7) + threadIdx_x // 7 < 576: + if T.Mul(32, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(32, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(32, 7) + threadIdx_x // 7) % 9 and (T.Mul(32, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +33 + +T.Mul(33, 7) + +T.Mul(33, 7) + threadIdx_x // 7 + +T.Mul(33, 7) + threadIdx_x // 7 < 576 + +T.Mul(33, 49) + +T.Mul(33, 49) + threadIdx_x + +T.Mul(33, 49) + threadIdx_x < 4032 + +T.Mul(33, 7) + +T.Mul(33, 7) + threadIdx_x // 7 + +(T.Mul(33, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(33, 7) + threadIdx_x // 
7) % 9 + +T.Mul(33, 7) + +T.Mul(33, 7) + threadIdx_x // 7 + +(T.Mul(33, 7) + threadIdx_x // 7) % 9 + +(T.Mul(33, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(33, 7) + threadIdx_x // 7) % 9 and (T.Mul(33, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(33, 7) + threadIdx_x // 7) % 9 and (T.Mul(33, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(33, 7) + threadIdx_x // 7) % 9 and (T.Mul(33, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(33, 7) + +T.Mul(33, 7) + threadIdx_x // 7 + +(T.Mul(33, 7) + threadIdx_x // 7) // 9 + +(T.Mul(33, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(33, 7) + +T.Mul(33, 7) + threadIdx_x // 7 + +(T.Mul(33, 7) + threadIdx_x // 7) % 9 + +(T.Mul(33, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(33, 7) + threadIdx_x // 7) % 9 and (T.Mul(33, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(33, 49) + +T.Mul(33, 
49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(33, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(33, 7) + threadIdx_x // 7) % 9 and (T.Mul(33, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(33, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(33, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(33, 7) + threadIdx_x // 7) % 9 and (T.Mul(33, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(33, 7) + threadIdx_x // 7 < 576: + if T.Mul(33, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(33, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(33, 7) + threadIdx_x // 7) % 9 and (T.Mul(33, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(33, 7) + threadIdx_x // 7 < 576: + if T.Mul(33, 49) + 
threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(33, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(33, 7) + threadIdx_x // 7) % 9 and (T.Mul(33, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +34 + +T.Mul(34, 7) + +T.Mul(34, 7) + threadIdx_x // 7 + +T.Mul(34, 7) + threadIdx_x // 7 < 576 + +T.Mul(34, 49) + +T.Mul(34, 49) + threadIdx_x + +T.Mul(34, 49) + threadIdx_x < 4032 + +T.Mul(34, 7) + +T.Mul(34, 7) + threadIdx_x // 7 + +(T.Mul(34, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(34, 7) + threadIdx_x // 7) % 9 + +T.Mul(34, 7) + +T.Mul(34, 7) + threadIdx_x // 7 + +(T.Mul(34, 7) + threadIdx_x // 7) % 9 + +(T.Mul(34, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(34, 7) + threadIdx_x // 7) % 9 and (T.Mul(34, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(34, 7) + threadIdx_x // 7) % 9 and (T.Mul(34, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(34, 7) + threadIdx_x // 7) % 9 and (T.Mul(34, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(34, 7) + +T.Mul(34, 7) + threadIdx_x // 7 + +(T.Mul(34, 7) + threadIdx_x // 7) // 9 + +(T.Mul(34, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(34, 7) + +T.Mul(34, 7) + threadIdx_x // 7 + +(T.Mul(34, 7) + threadIdx_x // 7) % 9 + +(T.Mul(34, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x // 
7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(34, 7) + threadIdx_x // 7) % 9 and (T.Mul(34, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(34, 49) + +T.Mul(34, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(34, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(34, 7) + threadIdx_x // 7) % 9 and (T.Mul(34, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(34, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(34, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(34, 7) + threadIdx_x // 7) % 9 and (T.Mul(34, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(34, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(34, 7) + threadIdx_x // 7 < 576: + if T.Mul(34, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(34, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(34, 7) + threadIdx_x // 7) % 9 and (T.Mul(34, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(34, 7) + threadIdx_x // 7 < 576: + if T.Mul(34, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(34, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(34, 7) + threadIdx_x // 7) % 9 and (T.Mul(34, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +35 + +T.Mul(35, 7) + +T.Mul(35, 7) + threadIdx_x // 7 + +T.Mul(35, 7) + threadIdx_x // 7 < 576 + +T.Mul(35, 49) + +T.Mul(35, 49) + threadIdx_x + +T.Mul(35, 49) + threadIdx_x < 4032 + +T.Mul(35, 7) + +T.Mul(35, 7) + threadIdx_x // 7 + +(T.Mul(35, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(35, 7) + threadIdx_x // 7) % 9 + +T.Mul(35, 7) + +T.Mul(35, 7) + threadIdx_x // 7 + +(T.Mul(35, 7) + threadIdx_x // 7) % 9 + +(T.Mul(35, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(35, 7) + threadIdx_x // 7) % 9 and (T.Mul(35, 7) + threadIdx_x // 7) 
% 9 < 8 + +1 <= (T.Mul(35, 7) + threadIdx_x // 7) % 9 and (T.Mul(35, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(35, 7) + threadIdx_x // 7) % 9 and (T.Mul(35, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(35, 7) + +T.Mul(35, 7) + threadIdx_x // 7 + +(T.Mul(35, 7) + threadIdx_x // 7) // 9 + +(T.Mul(35, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(35, 7) + +T.Mul(35, 7) + threadIdx_x // 7 + +(T.Mul(35, 7) + threadIdx_x // 7) % 9 + +(T.Mul(35, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(35, 7) + threadIdx_x // 7) % 9 and (T.Mul(35, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(35, 49) + +T.Mul(35, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(35, 49) + threadIdx_x] = 
T.if_then_else(1 <= (T.Mul(35, 7) + threadIdx_x // 7) % 9 and (T.Mul(35, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(35, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(35, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(35, 7) + threadIdx_x // 7) % 9 and (T.Mul(35, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(35, 7) + threadIdx_x // 7 < 576: + if T.Mul(35, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(35, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(35, 7) + threadIdx_x // 7) % 9 and (T.Mul(35, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(35, 7) + threadIdx_x // 7 < 576: + if T.Mul(35, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(35, 49) + threadIdx_x] = T.if_then_else(1 <= 
(T.Mul(35, 7) + threadIdx_x // 7) % 9 and (T.Mul(35, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +36 + +T.Mul(36, 7) + +T.Mul(36, 7) + threadIdx_x // 7 + +T.Mul(36, 7) + threadIdx_x // 7 < 576 + +T.Mul(36, 49) + +T.Mul(36, 49) + threadIdx_x + +T.Mul(36, 49) + threadIdx_x < 4032 + +T.Mul(36, 7) + +T.Mul(36, 7) + threadIdx_x // 7 + +(T.Mul(36, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(36, 7) + threadIdx_x // 7) % 9 + +T.Mul(36, 7) + +T.Mul(36, 7) + threadIdx_x // 7 + +(T.Mul(36, 7) + threadIdx_x // 7) % 9 + +(T.Mul(36, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(36, 7) + threadIdx_x // 7) % 9 and (T.Mul(36, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(36, 7) + threadIdx_x // 7) % 9 and (T.Mul(36, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(36, 7) + threadIdx_x // 7) % 9 and (T.Mul(36, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(36, 7) + +T.Mul(36, 7) + threadIdx_x // 7 + +(T.Mul(36, 7) + threadIdx_x // 7) // 9 + +(T.Mul(36, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(36, 7) + +T.Mul(36, 7) + threadIdx_x // 7 + +(T.Mul(36, 7) + threadIdx_x // 7) % 9 + +(T.Mul(36, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(36, 7) + 
threadIdx_x // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(36, 7) + threadIdx_x // 7) % 9 and (T.Mul(36, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(36, 49) + +T.Mul(36, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(36, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(36, 7) + threadIdx_x // 7) % 9 and (T.Mul(36, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(36, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(36, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(36, 7) + threadIdx_x // 7) % 9 and (T.Mul(36, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(36, 7) + threadIdx_x // 7 < 576: + if T.Mul(36, 49) + threadIdx_x < 4032: + 
pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(36, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(36, 7) + threadIdx_x // 7) % 9 and (T.Mul(36, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(36, 7) + threadIdx_x // 7 < 576: + if T.Mul(36, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(36, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(36, 7) + threadIdx_x // 7) % 9 and (T.Mul(36, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +37 + +T.Mul(37, 7) + +T.Mul(37, 7) + threadIdx_x // 7 + +T.Mul(37, 7) + threadIdx_x // 7 < 576 + +T.Mul(37, 49) + +T.Mul(37, 49) + threadIdx_x + +T.Mul(37, 49) + threadIdx_x < 4032 + +T.Mul(37, 7) + +T.Mul(37, 7) + threadIdx_x // 7 + +(T.Mul(37, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(37, 7) + threadIdx_x // 7) % 9 + +T.Mul(37, 7) + +T.Mul(37, 7) + threadIdx_x // 7 + +(T.Mul(37, 7) + threadIdx_x // 7) % 9 + +(T.Mul(37, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(37, 7) + threadIdx_x // 7) % 9 and (T.Mul(37, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(37, 7) + threadIdx_x // 7) % 9 and (T.Mul(37, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(37, 7) + threadIdx_x // 7) % 9 and (T.Mul(37, 7) + 
threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(37, 7) + +T.Mul(37, 7) + threadIdx_x // 7 + +(T.Mul(37, 7) + threadIdx_x // 7) // 9 + +(T.Mul(37, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(37, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(37, 7) + +T.Mul(37, 7) + threadIdx_x // 7 + +(T.Mul(37, 7) + threadIdx_x // 7) % 9 + +(T.Mul(37, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(37, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(37, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(37, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(37, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(37, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(37, 7) + threadIdx_x // 7) % 9 and (T.Mul(37, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(37, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(37, 49) + +T.Mul(37, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(37, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(37, 7) + threadIdx_x // 7) % 9 and (T.Mul(37, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer 
* 3136 + (T.Mul(37, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(37, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(37, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(37, 7) + threadIdx_x // 7) % 9 and (T.Mul(37, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(37, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(37, 7) + threadIdx_x // 7 < 576: + if T.Mul(37, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(37, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(37, 7) + threadIdx_x // 7) % 9 and (T.Mul(37, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(37, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(37, 7) + threadIdx_x // 7 < 576: + if T.Mul(37, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(37, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(37, 7) + threadIdx_x // 7) % 9 and (T.Mul(37, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(37, 7) + 
threadIdx_x // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +38 + +T.Mul(38, 7) + +T.Mul(38, 7) + threadIdx_x // 7 + +T.Mul(38, 7) + threadIdx_x // 7 < 576 + +T.Mul(38, 49) + +T.Mul(38, 49) + threadIdx_x + +T.Mul(38, 49) + threadIdx_x < 4032 + +T.Mul(38, 7) + +T.Mul(38, 7) + threadIdx_x // 7 + +(T.Mul(38, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(38, 7) + threadIdx_x // 7) % 9 + +T.Mul(38, 7) + +T.Mul(38, 7) + threadIdx_x // 7 + +(T.Mul(38, 7) + threadIdx_x // 7) % 9 + +(T.Mul(38, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(38, 7) + threadIdx_x // 7) % 9 and (T.Mul(38, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(38, 7) + threadIdx_x // 7) % 9 and (T.Mul(38, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(38, 7) + threadIdx_x // 7) % 9 and (T.Mul(38, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(38, 7) + +T.Mul(38, 7) + threadIdx_x // 7 + +(T.Mul(38, 7) + threadIdx_x // 7) // 9 + +(T.Mul(38, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(38, 7) + +T.Mul(38, 7) + threadIdx_x // 7 + +(T.Mul(38, 7) + threadIdx_x // 7) % 9 + +(T.Mul(38, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x // 7) 
% 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(38, 7) + threadIdx_x // 7) % 9 and (T.Mul(38, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(38, 49) + +T.Mul(38, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(38, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(38, 7) + threadIdx_x // 7) % 9 and (T.Mul(38, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(38, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(38, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(38, 7) + threadIdx_x // 7) % 9 and (T.Mul(38, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(38, 7) + threadIdx_x // 7 < 576: + if T.Mul(38, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(38, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(38, 7) + threadIdx_x 
// 7) % 9 and (T.Mul(38, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(38, 7) + threadIdx_x // 7 < 576: + if T.Mul(38, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(38, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(38, 7) + threadIdx_x // 7) % 9 and (T.Mul(38, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +39 + +T.Mul(39, 7) + +T.Mul(39, 7) + threadIdx_x // 7 + +T.Mul(39, 7) + threadIdx_x // 7 < 576 + +T.Mul(39, 49) + +T.Mul(39, 49) + threadIdx_x + +T.Mul(39, 49) + threadIdx_x < 4032 + +T.Mul(39, 7) + +T.Mul(39, 7) + threadIdx_x // 7 + +(T.Mul(39, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(39, 7) + threadIdx_x // 7) % 9 + +T.Mul(39, 7) + +T.Mul(39, 7) + threadIdx_x // 7 + +(T.Mul(39, 7) + threadIdx_x // 7) % 9 + +(T.Mul(39, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(39, 7) + threadIdx_x // 7) % 9 and (T.Mul(39, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(39, 7) + threadIdx_x // 7) % 9 and (T.Mul(39, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(39, 7) + threadIdx_x // 7) % 9 and (T.Mul(39, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(39, 7) + +T.Mul(39, 7) + threadIdx_x // 7 + +(T.Mul(39, 7) + threadIdx_x // 7) // 9 + +(T.Mul(39, 7) + threadIdx_x // 7) 
// 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(39, 7) + +T.Mul(39, 7) + threadIdx_x // 7 + +(T.Mul(39, 7) + threadIdx_x // 7) % 9 + +(T.Mul(39, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(39, 7) + threadIdx_x // 7) % 9 and (T.Mul(39, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(39, 49) + +T.Mul(39, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(39, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(39, 7) + threadIdx_x // 7) % 9 and (T.Mul(39, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(39, 49) + threadIdx_x < 4032: + pad_temp_shared 
= T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(39, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(39, 7) + threadIdx_x // 7) % 9 and (T.Mul(39, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(39, 7) + threadIdx_x // 7 < 576: + if T.Mul(39, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(39, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(39, 7) + threadIdx_x // 7) % 9 and (T.Mul(39, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(39, 7) + threadIdx_x // 7 < 576: + if T.Mul(39, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(39, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(39, 7) + threadIdx_x // 7) % 9 and (T.Mul(39, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +40 + +T.Mul(40, 7) + +T.Mul(40, 7) + threadIdx_x // 7 + +T.Mul(40, 7) + threadIdx_x // 7 < 576 + 
+T.Mul(40, 49) + +T.Mul(40, 49) + threadIdx_x + +T.Mul(40, 49) + threadIdx_x < 4032 + +T.Mul(40, 7) + +T.Mul(40, 7) + threadIdx_x // 7 + +(T.Mul(40, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(40, 7) + threadIdx_x // 7) % 9 + +T.Mul(40, 7) + +T.Mul(40, 7) + threadIdx_x // 7 + +(T.Mul(40, 7) + threadIdx_x // 7) % 9 + +(T.Mul(40, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(40, 7) + threadIdx_x // 7) % 9 and (T.Mul(40, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(40, 7) + threadIdx_x // 7) % 9 and (T.Mul(40, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(40, 7) + threadIdx_x // 7) % 9 and (T.Mul(40, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(40, 7) + +T.Mul(40, 7) + threadIdx_x // 7 + +(T.Mul(40, 7) + threadIdx_x // 7) // 9 + +(T.Mul(40, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(40, 7) + +T.Mul(40, 7) + threadIdx_x // 7 + +(T.Mul(40, 7) + threadIdx_x // 7) % 9 + +(T.Mul(40, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(40, 7) + threadIdx_x // 7) % 9 and (T.Mul(40, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + 
threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(40, 49) + +T.Mul(40, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(40, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(40, 7) + threadIdx_x // 7) % 9 and (T.Mul(40, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(40, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(40, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(40, 7) + threadIdx_x // 7) % 9 and (T.Mul(40, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(40, 7) + threadIdx_x // 7 < 576: + if T.Mul(40, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(40, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(40, 7) + threadIdx_x // 7) % 9 and (T.Mul(40, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(40, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(40, 7) + threadIdx_x // 7 < 576: + if T.Mul(40, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(40, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(40, 7) + threadIdx_x // 7) % 9 and (T.Mul(40, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +41 + +T.Mul(41, 7) + +T.Mul(41, 7) + threadIdx_x // 7 + +T.Mul(41, 7) + threadIdx_x // 7 < 576 + +T.Mul(41, 49) + +T.Mul(41, 49) + threadIdx_x + +T.Mul(41, 49) + threadIdx_x < 4032 + +T.Mul(41, 7) + +T.Mul(41, 7) + threadIdx_x // 7 + +(T.Mul(41, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(41, 7) + threadIdx_x // 7) % 9 + +T.Mul(41, 7) + +T.Mul(41, 7) + threadIdx_x // 7 + +(T.Mul(41, 7) + threadIdx_x // 7) % 9 + +(T.Mul(41, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(41, 7) + threadIdx_x // 7) % 9 and (T.Mul(41, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(41, 7) + threadIdx_x // 7) % 9 and (T.Mul(41, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(41, 7) + threadIdx_x // 7) % 9 and (T.Mul(41, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(41, 7) + +T.Mul(41, 7) + threadIdx_x // 7 + +(T.Mul(41, 7) + threadIdx_x // 7) // 9 + +(T.Mul(41, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(41, 7) + +T.Mul(41, 7) + threadIdx_x // 7 + +(T.Mul(41, 7) + threadIdx_x // 7) % 9 + +(T.Mul(41, 7) + threadIdx_x // 7) % 9 * 
7 + +rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(41, 7) + threadIdx_x // 7) % 9 and (T.Mul(41, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(41, 49) + +T.Mul(41, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(41, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(41, 7) + threadIdx_x // 7) % 9 and (T.Mul(41, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(41, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(41, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(41, 7) + 
threadIdx_x // 7) % 9 and (T.Mul(41, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(41, 7) + threadIdx_x // 7 < 576: + if T.Mul(41, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(41, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(41, 7) + threadIdx_x // 7) % 9 and (T.Mul(41, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(41, 7) + threadIdx_x // 7 < 576: + if T.Mul(41, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(41, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(41, 7) + threadIdx_x // 7) % 9 and (T.Mul(41, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +42 + +T.Mul(42, 7) + +T.Mul(42, 7) + threadIdx_x // 7 + +T.Mul(42, 7) + threadIdx_x // 7 < 576 + +T.Mul(42, 49) + +T.Mul(42, 49) + threadIdx_x + +T.Mul(42, 49) + threadIdx_x < 4032 + +T.Mul(42, 7) + +T.Mul(42, 7) + threadIdx_x // 7 + +(T.Mul(42, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(42, 7) + threadIdx_x // 
7) % 9 + +T.Mul(42, 7) + +T.Mul(42, 7) + threadIdx_x // 7 + +(T.Mul(42, 7) + threadIdx_x // 7) % 9 + +(T.Mul(42, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(42, 7) + threadIdx_x // 7) % 9 and (T.Mul(42, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(42, 7) + threadIdx_x // 7) % 9 and (T.Mul(42, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(42, 7) + threadIdx_x // 7) % 9 and (T.Mul(42, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(42, 7) + +T.Mul(42, 7) + threadIdx_x // 7 + +(T.Mul(42, 7) + threadIdx_x // 7) // 9 + +(T.Mul(42, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(42, 7) + +T.Mul(42, 7) + threadIdx_x // 7 + +(T.Mul(42, 7) + threadIdx_x // 7) % 9 + +(T.Mul(42, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(42, 7) + threadIdx_x // 7) % 9 and (T.Mul(42, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(42, 49) + +T.Mul(42, 
49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(42, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(42, 7) + threadIdx_x // 7) % 9 and (T.Mul(42, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(42, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(42, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(42, 7) + threadIdx_x // 7) % 9 and (T.Mul(42, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(42, 7) + threadIdx_x // 7 < 576: + if T.Mul(42, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(42, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(42, 7) + threadIdx_x // 7) % 9 and (T.Mul(42, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(42, 7) + threadIdx_x // 7 < 576: + if T.Mul(42, 49) + 
threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(42, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(42, 7) + threadIdx_x // 7) % 9 and (T.Mul(42, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +43 + +T.Mul(43, 7) + +T.Mul(43, 7) + threadIdx_x // 7 + +T.Mul(43, 7) + threadIdx_x // 7 < 576 + +T.Mul(43, 49) + +T.Mul(43, 49) + threadIdx_x + +T.Mul(43, 49) + threadIdx_x < 4032 + +T.Mul(43, 7) + +T.Mul(43, 7) + threadIdx_x // 7 + +(T.Mul(43, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(43, 7) + threadIdx_x // 7) % 9 + +T.Mul(43, 7) + +T.Mul(43, 7) + threadIdx_x // 7 + +(T.Mul(43, 7) + threadIdx_x // 7) % 9 + +(T.Mul(43, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(43, 7) + threadIdx_x // 7) % 9 and (T.Mul(43, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(43, 7) + threadIdx_x // 7) % 9 and (T.Mul(43, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(43, 7) + threadIdx_x // 7) % 9 and (T.Mul(43, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(43, 7) + +T.Mul(43, 7) + threadIdx_x // 7 + +(T.Mul(43, 7) + threadIdx_x // 7) // 9 + +(T.Mul(43, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(43, 7) + +T.Mul(43, 7) + threadIdx_x // 7 + +(T.Mul(43, 7) + threadIdx_x // 7) % 9 + +(T.Mul(43, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x // 
7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(43, 7) + threadIdx_x // 7) % 9 and (T.Mul(43, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(43, 49) + +T.Mul(43, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(43, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(43, 7) + threadIdx_x // 7) % 9 and (T.Mul(43, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(43, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(43, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(43, 7) + threadIdx_x // 7) % 9 and (T.Mul(43, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(43, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(43, 7) + threadIdx_x // 7 < 576: + if T.Mul(43, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(43, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(43, 7) + threadIdx_x // 7) % 9 and (T.Mul(43, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(43, 7) + threadIdx_x // 7 < 576: + if T.Mul(43, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(43, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(43, 7) + threadIdx_x // 7) % 9 and (T.Mul(43, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +44 + +T.Mul(44, 7) + +T.Mul(44, 7) + threadIdx_x // 7 + +T.Mul(44, 7) + threadIdx_x // 7 < 576 + +T.Mul(44, 49) + +T.Mul(44, 49) + threadIdx_x + +T.Mul(44, 49) + threadIdx_x < 4032 + +T.Mul(44, 7) + +T.Mul(44, 7) + threadIdx_x // 7 + +(T.Mul(44, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(44, 7) + threadIdx_x // 7) % 9 + +T.Mul(44, 7) + +T.Mul(44, 7) + threadIdx_x // 7 + +(T.Mul(44, 7) + threadIdx_x // 7) % 9 + +(T.Mul(44, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(44, 7) + threadIdx_x // 7) % 9 and (T.Mul(44, 7) + threadIdx_x // 7) 
% 9 < 8 + +1 <= (T.Mul(44, 7) + threadIdx_x // 7) % 9 and (T.Mul(44, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(44, 7) + threadIdx_x // 7) % 9 and (T.Mul(44, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(44, 7) + +T.Mul(44, 7) + threadIdx_x // 7 + +(T.Mul(44, 7) + threadIdx_x // 7) // 9 + +(T.Mul(44, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(44, 7) + +T.Mul(44, 7) + threadIdx_x // 7 + +(T.Mul(44, 7) + threadIdx_x // 7) % 9 + +(T.Mul(44, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(44, 7) + threadIdx_x // 7) % 9 and (T.Mul(44, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(44, 49) + +T.Mul(44, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(44, 49) + threadIdx_x] = 
T.if_then_else(1 <= (T.Mul(44, 7) + threadIdx_x // 7) % 9 and (T.Mul(44, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(44, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(44, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(44, 7) + threadIdx_x // 7) % 9 and (T.Mul(44, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(44, 7) + threadIdx_x // 7 < 576: + if T.Mul(44, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(44, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(44, 7) + threadIdx_x // 7) % 9 and (T.Mul(44, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(44, 7) + threadIdx_x // 7 < 576: + if T.Mul(44, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(44, 49) + threadIdx_x] = T.if_then_else(1 <= 
(T.Mul(44, 7) + threadIdx_x // 7) % 9 and (T.Mul(44, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +45 + +T.Mul(45, 7) + +T.Mul(45, 7) + threadIdx_x // 7 + +T.Mul(45, 7) + threadIdx_x // 7 < 576 + +T.Mul(45, 49) + +T.Mul(45, 49) + threadIdx_x + +T.Mul(45, 49) + threadIdx_x < 4032 + +T.Mul(45, 7) + +T.Mul(45, 7) + threadIdx_x // 7 + +(T.Mul(45, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(45, 7) + threadIdx_x // 7) % 9 + +T.Mul(45, 7) + +T.Mul(45, 7) + threadIdx_x // 7 + +(T.Mul(45, 7) + threadIdx_x // 7) % 9 + +(T.Mul(45, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(45, 7) + threadIdx_x // 7) % 9 and (T.Mul(45, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(45, 7) + threadIdx_x // 7) % 9 and (T.Mul(45, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(45, 7) + threadIdx_x // 7) % 9 and (T.Mul(45, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(45, 7) + +T.Mul(45, 7) + threadIdx_x // 7 + +(T.Mul(45, 7) + threadIdx_x // 7) // 9 + +(T.Mul(45, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(45, 7) + +T.Mul(45, 7) + threadIdx_x // 7 + +(T.Mul(45, 7) + threadIdx_x // 7) % 9 + +(T.Mul(45, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(45, 7) + 
threadIdx_x // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(45, 7) + threadIdx_x // 7) % 9 and (T.Mul(45, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(45, 49) + +T.Mul(45, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(45, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(45, 7) + threadIdx_x // 7) % 9 and (T.Mul(45, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(45, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(45, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(45, 7) + threadIdx_x // 7) % 9 and (T.Mul(45, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(45, 7) + threadIdx_x // 7 < 576: + if T.Mul(45, 49) + threadIdx_x < 4032: + 
pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(45, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(45, 7) + threadIdx_x // 7) % 9 and (T.Mul(45, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(45, 7) + threadIdx_x // 7 < 576: + if T.Mul(45, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(45, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(45, 7) + threadIdx_x // 7) % 9 and (T.Mul(45, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +46 + +T.Mul(46, 7) + +T.Mul(46, 7) + threadIdx_x // 7 + +T.Mul(46, 7) + threadIdx_x // 7 < 576 + +T.Mul(46, 49) + +T.Mul(46, 49) + threadIdx_x + +T.Mul(46, 49) + threadIdx_x < 4032 + +T.Mul(46, 7) + +T.Mul(46, 7) + threadIdx_x // 7 + +(T.Mul(46, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(46, 7) + threadIdx_x // 7) % 9 + +T.Mul(46, 7) + +T.Mul(46, 7) + threadIdx_x // 7 + +(T.Mul(46, 7) + threadIdx_x // 7) % 9 + +(T.Mul(46, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(46, 7) + threadIdx_x // 7) % 9 and (T.Mul(46, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(46, 7) + threadIdx_x // 7) % 9 and (T.Mul(46, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(46, 7) + threadIdx_x // 7) % 9 and (T.Mul(46, 7) + 
threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(46, 7) + +T.Mul(46, 7) + threadIdx_x // 7 + +(T.Mul(46, 7) + threadIdx_x // 7) // 9 + +(T.Mul(46, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(46, 7) + +T.Mul(46, 7) + threadIdx_x // 7 + +(T.Mul(46, 7) + threadIdx_x // 7) % 9 + +(T.Mul(46, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(46, 7) + threadIdx_x // 7) % 9 and (T.Mul(46, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(46, 49) + +T.Mul(46, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(46, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(46, 7) + threadIdx_x // 7) % 9 and (T.Mul(46, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer 
* 3136 + (T.Mul(46, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(46, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(46, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(46, 7) + threadIdx_x // 7) % 9 and (T.Mul(46, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(46, 7) + threadIdx_x // 7 < 576: + if T.Mul(46, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(46, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(46, 7) + threadIdx_x // 7) % 9 and (T.Mul(46, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(46, 7) + threadIdx_x // 7 < 576: + if T.Mul(46, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(46, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(46, 7) + threadIdx_x // 7) % 9 and (T.Mul(46, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(46, 7) + 
threadIdx_x // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +47 + +T.Mul(47, 7) + +T.Mul(47, 7) + threadIdx_x // 7 + +T.Mul(47, 7) + threadIdx_x // 7 < 576 + +T.Mul(47, 49) + +T.Mul(47, 49) + threadIdx_x + +T.Mul(47, 49) + threadIdx_x < 4032 + +T.Mul(47, 7) + +T.Mul(47, 7) + threadIdx_x // 7 + +(T.Mul(47, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(47, 7) + threadIdx_x // 7) % 9 + +T.Mul(47, 7) + +T.Mul(47, 7) + threadIdx_x // 7 + +(T.Mul(47, 7) + threadIdx_x // 7) % 9 + +(T.Mul(47, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(47, 7) + threadIdx_x // 7) % 9 and (T.Mul(47, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(47, 7) + threadIdx_x // 7) % 9 and (T.Mul(47, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(47, 7) + threadIdx_x // 7) % 9 and (T.Mul(47, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(47, 7) + +T.Mul(47, 7) + threadIdx_x // 7 + +(T.Mul(47, 7) + threadIdx_x // 7) // 9 + +(T.Mul(47, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(47, 7) + +T.Mul(47, 7) + threadIdx_x // 7 + +(T.Mul(47, 7) + threadIdx_x // 7) % 9 + +(T.Mul(47, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x // 7) 
% 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(47, 7) + threadIdx_x // 7) % 9 and (T.Mul(47, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(47, 49) + +T.Mul(47, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(47, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(47, 7) + threadIdx_x // 7) % 9 and (T.Mul(47, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(47, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(47, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(47, 7) + threadIdx_x // 7) % 9 and (T.Mul(47, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(47, 7) + threadIdx_x // 7 < 576: + if T.Mul(47, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(47, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(47, 7) + threadIdx_x 
// 7) % 9 and (T.Mul(47, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(47, 7) + threadIdx_x // 7 < 576: + if T.Mul(47, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(47, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(47, 7) + threadIdx_x // 7) % 9 and (T.Mul(47, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +48 + +T.Mul(48, 7) + +T.Mul(48, 7) + threadIdx_x // 7 + +T.Mul(48, 7) + threadIdx_x // 7 < 576 + +T.Mul(48, 49) + +T.Mul(48, 49) + threadIdx_x + +T.Mul(48, 49) + threadIdx_x < 4032 + +T.Mul(48, 7) + +T.Mul(48, 7) + threadIdx_x // 7 + +(T.Mul(48, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(48, 7) + threadIdx_x // 7) % 9 + +T.Mul(48, 7) + +T.Mul(48, 7) + threadIdx_x // 7 + +(T.Mul(48, 7) + threadIdx_x // 7) % 9 + +(T.Mul(48, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(48, 7) + threadIdx_x // 7) % 9 and (T.Mul(48, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(48, 7) + threadIdx_x // 7) % 9 and (T.Mul(48, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(48, 7) + threadIdx_x // 7) % 9 and (T.Mul(48, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(48, 7) + +T.Mul(48, 7) + threadIdx_x // 7 + +(T.Mul(48, 7) + threadIdx_x // 7) // 9 + +(T.Mul(48, 7) + threadIdx_x // 7) 
// 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(48, 7) + +T.Mul(48, 7) + threadIdx_x // 7 + +(T.Mul(48, 7) + threadIdx_x // 7) % 9 + +(T.Mul(48, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(48, 7) + threadIdx_x // 7) % 9 and (T.Mul(48, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(48, 49) + +T.Mul(48, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(48, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(48, 7) + threadIdx_x // 7) % 9 and (T.Mul(48, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(48, 49) + threadIdx_x < 4032: + pad_temp_shared 
= T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(48, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(48, 7) + threadIdx_x // 7) % 9 and (T.Mul(48, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(48, 7) + threadIdx_x // 7 < 576: + if T.Mul(48, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(48, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(48, 7) + threadIdx_x // 7) % 9 and (T.Mul(48, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(48, 7) + threadIdx_x // 7 < 576: + if T.Mul(48, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(48, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(48, 7) + threadIdx_x // 7) % 9 and (T.Mul(48, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +49 + +T.Mul(49, 7) + +T.Mul(49, 7) + threadIdx_x // 7 + +T.Mul(49, 7) + threadIdx_x // 7 < 576 + 
+T.Mul(49, 49) + +T.Mul(49, 49) + threadIdx_x + +T.Mul(49, 49) + threadIdx_x < 4032 + +T.Mul(49, 7) + +T.Mul(49, 7) + threadIdx_x // 7 + +(T.Mul(49, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(49, 7) + threadIdx_x // 7) % 9 + +T.Mul(49, 7) + +T.Mul(49, 7) + threadIdx_x // 7 + +(T.Mul(49, 7) + threadIdx_x // 7) % 9 + +(T.Mul(49, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(49, 7) + threadIdx_x // 7) % 9 and (T.Mul(49, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(49, 7) + threadIdx_x // 7) % 9 and (T.Mul(49, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(49, 7) + threadIdx_x // 7) % 9 and (T.Mul(49, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(49, 7) + +T.Mul(49, 7) + threadIdx_x // 7 + +(T.Mul(49, 7) + threadIdx_x // 7) // 9 + +(T.Mul(49, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(49, 7) + +T.Mul(49, 7) + threadIdx_x // 7 + +(T.Mul(49, 7) + threadIdx_x // 7) % 9 + +(T.Mul(49, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(49, 7) + threadIdx_x // 7) % 9 and (T.Mul(49, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + 
threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(49, 49) + +T.Mul(49, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(49, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(49, 7) + threadIdx_x // 7) % 9 and (T.Mul(49, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(49, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(49, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(49, 7) + threadIdx_x // 7) % 9 and (T.Mul(49, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(49, 7) + threadIdx_x // 7 < 576: + if T.Mul(49, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(49, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(49, 7) + threadIdx_x // 7) % 9 and (T.Mul(49, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(49, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(49, 7) + threadIdx_x // 7 < 576: + if T.Mul(49, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(49, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(49, 7) + threadIdx_x // 7) % 9 and (T.Mul(49, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +50 + +T.Mul(50, 7) + +T.Mul(50, 7) + threadIdx_x // 7 + +T.Mul(50, 7) + threadIdx_x // 7 < 576 + +T.Mul(50, 49) + +T.Mul(50, 49) + threadIdx_x + +T.Mul(50, 49) + threadIdx_x < 4032 + +T.Mul(50, 7) + +T.Mul(50, 7) + threadIdx_x // 7 + +(T.Mul(50, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(50, 7) + threadIdx_x // 7) % 9 + +T.Mul(50, 7) + +T.Mul(50, 7) + threadIdx_x // 7 + +(T.Mul(50, 7) + threadIdx_x // 7) % 9 + +(T.Mul(50, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(50, 7) + threadIdx_x // 7) % 9 and (T.Mul(50, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(50, 7) + threadIdx_x // 7) % 9 and (T.Mul(50, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(50, 7) + threadIdx_x // 7) % 9 and (T.Mul(50, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(50, 7) + +T.Mul(50, 7) + threadIdx_x // 7 + +(T.Mul(50, 7) + threadIdx_x // 7) // 9 + +(T.Mul(50, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(50, 7) + +T.Mul(50, 7) + threadIdx_x // 7 + +(T.Mul(50, 7) + threadIdx_x // 7) % 9 + +(T.Mul(50, 7) + threadIdx_x // 7) % 9 * 
7 + +rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(50, 7) + threadIdx_x // 7) % 9 and (T.Mul(50, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(50, 49) + +T.Mul(50, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(50, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(50, 7) + threadIdx_x // 7) % 9 and (T.Mul(50, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(50, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(50, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(50, 7) + 
threadIdx_x // 7) % 9 and (T.Mul(50, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(50, 7) + threadIdx_x // 7 < 576: + if T.Mul(50, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(50, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(50, 7) + threadIdx_x // 7) % 9 and (T.Mul(50, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(50, 7) + threadIdx_x // 7 < 576: + if T.Mul(50, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(50, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(50, 7) + threadIdx_x // 7) % 9 and (T.Mul(50, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +51 + +T.Mul(51, 7) + +T.Mul(51, 7) + threadIdx_x // 7 + +T.Mul(51, 7) + threadIdx_x // 7 < 576 + +T.Mul(51, 49) + +T.Mul(51, 49) + threadIdx_x + +T.Mul(51, 49) + threadIdx_x < 4032 + +T.Mul(51, 7) + +T.Mul(51, 7) + threadIdx_x // 7 + +(T.Mul(51, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(51, 7) + threadIdx_x // 
7) % 9 + +T.Mul(51, 7) + +T.Mul(51, 7) + threadIdx_x // 7 + +(T.Mul(51, 7) + threadIdx_x // 7) % 9 + +(T.Mul(51, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(51, 7) + threadIdx_x // 7) % 9 and (T.Mul(51, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(51, 7) + threadIdx_x // 7) % 9 and (T.Mul(51, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(51, 7) + threadIdx_x // 7) % 9 and (T.Mul(51, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(51, 7) + +T.Mul(51, 7) + threadIdx_x // 7 + +(T.Mul(51, 7) + threadIdx_x // 7) // 9 + +(T.Mul(51, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(51, 7) + +T.Mul(51, 7) + threadIdx_x // 7 + +(T.Mul(51, 7) + threadIdx_x // 7) % 9 + +(T.Mul(51, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(51, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(51, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(51, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(51, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(51, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(51, 7) + threadIdx_x // 7) % 9 and (T.Mul(51, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(51, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(51, 49) + +T.Mul(51, 
49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(51, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(51, 7) + threadIdx_x // 7) % 9 and (T.Mul(51, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(51, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(51, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(51, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(51, 7) + threadIdx_x // 7) % 9 and (T.Mul(51, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(51, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(51, 7) + threadIdx_x // 7 < 576: + if T.Mul(51, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(51, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(51, 7) + threadIdx_x // 7) % 9 and (T.Mul(51, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(51, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(51, 7) + threadIdx_x // 7 < 576: + if T.Mul(51, 49) + 
threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(51, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(51, 7) + threadIdx_x // 7) % 9 and (T.Mul(51, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(51, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +52 + +T.Mul(52, 7) + +T.Mul(52, 7) + threadIdx_x // 7 + +T.Mul(52, 7) + threadIdx_x // 7 < 576 + +T.Mul(52, 49) + +T.Mul(52, 49) + threadIdx_x + +T.Mul(52, 49) + threadIdx_x < 4032 + +T.Mul(52, 7) + +T.Mul(52, 7) + threadIdx_x // 7 + +(T.Mul(52, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(52, 7) + threadIdx_x // 7) % 9 + +T.Mul(52, 7) + +T.Mul(52, 7) + threadIdx_x // 7 + +(T.Mul(52, 7) + threadIdx_x // 7) % 9 + +(T.Mul(52, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(52, 7) + threadIdx_x // 7) % 9 and (T.Mul(52, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(52, 7) + threadIdx_x // 7) % 9 and (T.Mul(52, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(52, 7) + threadIdx_x // 7) % 9 and (T.Mul(52, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(52, 7) + +T.Mul(52, 7) + threadIdx_x // 7 + +(T.Mul(52, 7) + threadIdx_x // 7) // 9 + +(T.Mul(52, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(52, 7) + +T.Mul(52, 7) + threadIdx_x // 7 + +(T.Mul(52, 7) + threadIdx_x // 7) % 9 + +(T.Mul(52, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x // 
7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(52, 7) + threadIdx_x // 7) % 9 and (T.Mul(52, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(52, 49) + +T.Mul(52, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(52, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(52, 7) + threadIdx_x // 7) % 9 and (T.Mul(52, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(52, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(52, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(52, 7) + threadIdx_x // 7) % 9 and (T.Mul(52, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(52, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(52, 7) + threadIdx_x // 7 < 576: + if T.Mul(52, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(52, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(52, 7) + threadIdx_x // 7) % 9 and (T.Mul(52, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(52, 7) + threadIdx_x // 7 < 576: + if T.Mul(52, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(52, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(52, 7) + threadIdx_x // 7) % 9 and (T.Mul(52, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +53 + +T.Mul(53, 7) + +T.Mul(53, 7) + threadIdx_x // 7 + +T.Mul(53, 7) + threadIdx_x // 7 < 576 + +T.Mul(53, 49) + +T.Mul(53, 49) + threadIdx_x + +T.Mul(53, 49) + threadIdx_x < 4032 + +T.Mul(53, 7) + +T.Mul(53, 7) + threadIdx_x // 7 + +(T.Mul(53, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(53, 7) + threadIdx_x // 7) % 9 + +T.Mul(53, 7) + +T.Mul(53, 7) + threadIdx_x // 7 + +(T.Mul(53, 7) + threadIdx_x // 7) % 9 + +(T.Mul(53, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(53, 7) + threadIdx_x // 7) % 9 and (T.Mul(53, 7) + threadIdx_x // 7) 
% 9 < 8 + +1 <= (T.Mul(53, 7) + threadIdx_x // 7) % 9 and (T.Mul(53, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(53, 7) + threadIdx_x // 7) % 9 and (T.Mul(53, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(53, 7) + +T.Mul(53, 7) + threadIdx_x // 7 + +(T.Mul(53, 7) + threadIdx_x // 7) // 9 + +(T.Mul(53, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(53, 7) + +T.Mul(53, 7) + threadIdx_x // 7 + +(T.Mul(53, 7) + threadIdx_x // 7) % 9 + +(T.Mul(53, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(53, 7) + threadIdx_x // 7) % 9 and (T.Mul(53, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(53, 49) + +T.Mul(53, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(53, 49) + threadIdx_x] = 
T.if_then_else(1 <= (T.Mul(53, 7) + threadIdx_x // 7) % 9 and (T.Mul(53, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(53, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(53, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(53, 7) + threadIdx_x // 7) % 9 and (T.Mul(53, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(53, 7) + threadIdx_x // 7 < 576: + if T.Mul(53, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(53, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(53, 7) + threadIdx_x // 7) % 9 and (T.Mul(53, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(53, 7) + threadIdx_x // 7 < 576: + if T.Mul(53, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(53, 49) + threadIdx_x] = T.if_then_else(1 <= 
(T.Mul(53, 7) + threadIdx_x // 7) % 9 and (T.Mul(53, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +54 + +T.Mul(54, 7) + +T.Mul(54, 7) + threadIdx_x // 7 + +T.Mul(54, 7) + threadIdx_x // 7 < 576 + +T.Mul(54, 49) + +T.Mul(54, 49) + threadIdx_x + +T.Mul(54, 49) + threadIdx_x < 4032 + +T.Mul(54, 7) + +T.Mul(54, 7) + threadIdx_x // 7 + +(T.Mul(54, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(54, 7) + threadIdx_x // 7) % 9 + +T.Mul(54, 7) + +T.Mul(54, 7) + threadIdx_x // 7 + +(T.Mul(54, 7) + threadIdx_x // 7) % 9 + +(T.Mul(54, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(54, 7) + threadIdx_x // 7) % 9 and (T.Mul(54, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(54, 7) + threadIdx_x // 7) % 9 and (T.Mul(54, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(54, 7) + threadIdx_x // 7) % 9 and (T.Mul(54, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(54, 7) + +T.Mul(54, 7) + threadIdx_x // 7 + +(T.Mul(54, 7) + threadIdx_x // 7) // 9 + +(T.Mul(54, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(54, 7) + +T.Mul(54, 7) + threadIdx_x // 7 + +(T.Mul(54, 7) + threadIdx_x // 7) % 9 + +(T.Mul(54, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(54, 7) + 
threadIdx_x // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(54, 7) + threadIdx_x // 7) % 9 and (T.Mul(54, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(54, 49) + +T.Mul(54, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(54, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(54, 7) + threadIdx_x // 7) % 9 and (T.Mul(54, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(54, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(54, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(54, 7) + threadIdx_x // 7) % 9 and (T.Mul(54, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(54, 7) + threadIdx_x // 7 < 576: + if T.Mul(54, 49) + threadIdx_x < 4032: + 
pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(54, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(54, 7) + threadIdx_x // 7) % 9 and (T.Mul(54, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(54, 7) + threadIdx_x // 7 < 576: + if T.Mul(54, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(54, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(54, 7) + threadIdx_x // 7) % 9 and (T.Mul(54, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +55 + +T.Mul(55, 7) + +T.Mul(55, 7) + threadIdx_x // 7 + +T.Mul(55, 7) + threadIdx_x // 7 < 576 + +T.Mul(55, 49) + +T.Mul(55, 49) + threadIdx_x + +T.Mul(55, 49) + threadIdx_x < 4032 + +T.Mul(55, 7) + +T.Mul(55, 7) + threadIdx_x // 7 + +(T.Mul(55, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(55, 7) + threadIdx_x // 7) % 9 + +T.Mul(55, 7) + +T.Mul(55, 7) + threadIdx_x // 7 + +(T.Mul(55, 7) + threadIdx_x // 7) % 9 + +(T.Mul(55, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(55, 7) + threadIdx_x // 7) % 9 and (T.Mul(55, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(55, 7) + threadIdx_x // 7) % 9 and (T.Mul(55, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(55, 7) + threadIdx_x // 7) % 9 and (T.Mul(55, 7) + 
threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(55, 7) + +T.Mul(55, 7) + threadIdx_x // 7 + +(T.Mul(55, 7) + threadIdx_x // 7) // 9 + +(T.Mul(55, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(55, 7) + +T.Mul(55, 7) + threadIdx_x // 7 + +(T.Mul(55, 7) + threadIdx_x // 7) % 9 + +(T.Mul(55, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(55, 7) + threadIdx_x // 7) % 9 and (T.Mul(55, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(55, 49) + +T.Mul(55, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(55, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(55, 7) + threadIdx_x // 7) % 9 and (T.Mul(55, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer 
* 3136 + (T.Mul(55, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(55, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(55, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(55, 7) + threadIdx_x // 7) % 9 and (T.Mul(55, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(55, 7) + threadIdx_x // 7 < 576: + if T.Mul(55, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(55, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(55, 7) + threadIdx_x // 7) % 9 and (T.Mul(55, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(55, 7) + threadIdx_x // 7 < 576: + if T.Mul(55, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(55, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(55, 7) + threadIdx_x // 7) % 9 and (T.Mul(55, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(55, 7) + 
threadIdx_x // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +56 + +T.Mul(56, 7) + +T.Mul(56, 7) + threadIdx_x // 7 + +T.Mul(56, 7) + threadIdx_x // 7 < 576 + +T.Mul(56, 49) + +T.Mul(56, 49) + threadIdx_x + +T.Mul(56, 49) + threadIdx_x < 4032 + +T.Mul(56, 7) + +T.Mul(56, 7) + threadIdx_x // 7 + +(T.Mul(56, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(56, 7) + threadIdx_x // 7) % 9 + +T.Mul(56, 7) + +T.Mul(56, 7) + threadIdx_x // 7 + +(T.Mul(56, 7) + threadIdx_x // 7) % 9 + +(T.Mul(56, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(56, 7) + threadIdx_x // 7) % 9 and (T.Mul(56, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(56, 7) + threadIdx_x // 7) % 9 and (T.Mul(56, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(56, 7) + threadIdx_x // 7) % 9 and (T.Mul(56, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(56, 7) + +T.Mul(56, 7) + threadIdx_x // 7 + +(T.Mul(56, 7) + threadIdx_x // 7) // 9 + +(T.Mul(56, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(56, 7) + +T.Mul(56, 7) + threadIdx_x // 7 + +(T.Mul(56, 7) + threadIdx_x // 7) % 9 + +(T.Mul(56, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x // 7) 
% 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(56, 7) + threadIdx_x // 7) % 9 and (T.Mul(56, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(56, 49) + +T.Mul(56, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(56, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(56, 7) + threadIdx_x // 7) % 9 and (T.Mul(56, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(56, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(56, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(56, 7) + threadIdx_x // 7) % 9 and (T.Mul(56, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(56, 7) + threadIdx_x // 7 < 576: + if T.Mul(56, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(56, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(56, 7) + threadIdx_x 
// 7) % 9 and (T.Mul(56, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(56, 7) + threadIdx_x // 7 < 576: + if T.Mul(56, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(56, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(56, 7) + threadIdx_x // 7) % 9 and (T.Mul(56, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +57 + +T.Mul(57, 7) + +T.Mul(57, 7) + threadIdx_x // 7 + +T.Mul(57, 7) + threadIdx_x // 7 < 576 + +T.Mul(57, 49) + +T.Mul(57, 49) + threadIdx_x + +T.Mul(57, 49) + threadIdx_x < 4032 + +T.Mul(57, 7) + +T.Mul(57, 7) + threadIdx_x // 7 + +(T.Mul(57, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(57, 7) + threadIdx_x // 7) % 9 + +T.Mul(57, 7) + +T.Mul(57, 7) + threadIdx_x // 7 + +(T.Mul(57, 7) + threadIdx_x // 7) % 9 + +(T.Mul(57, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(57, 7) + threadIdx_x // 7) % 9 and (T.Mul(57, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(57, 7) + threadIdx_x // 7) % 9 and (T.Mul(57, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(57, 7) + threadIdx_x // 7) % 9 and (T.Mul(57, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(57, 7) + +T.Mul(57, 7) + threadIdx_x // 7 + +(T.Mul(57, 7) + threadIdx_x // 7) // 9 + +(T.Mul(57, 7) + threadIdx_x // 7) 
// 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(57, 7) + +T.Mul(57, 7) + threadIdx_x // 7 + +(T.Mul(57, 7) + threadIdx_x // 7) % 9 + +(T.Mul(57, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(57, 7) + threadIdx_x // 7) % 9 and (T.Mul(57, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(57, 49) + +T.Mul(57, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(57, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(57, 7) + threadIdx_x // 7) % 9 and (T.Mul(57, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(57, 49) + threadIdx_x < 4032: + pad_temp_shared 
= T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(57, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(57, 7) + threadIdx_x // 7) % 9 and (T.Mul(57, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(57, 7) + threadIdx_x // 7 < 576: + if T.Mul(57, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(57, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(57, 7) + threadIdx_x // 7) % 9 and (T.Mul(57, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(57, 7) + threadIdx_x // 7 < 576: + if T.Mul(57, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(57, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(57, 7) + threadIdx_x // 7) % 9 and (T.Mul(57, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +58 + +T.Mul(58, 7) + +T.Mul(58, 7) + threadIdx_x // 7 + +T.Mul(58, 7) + threadIdx_x // 7 < 576 + 
+T.Mul(58, 49) + +T.Mul(58, 49) + threadIdx_x + +T.Mul(58, 49) + threadIdx_x < 4032 + +T.Mul(58, 7) + +T.Mul(58, 7) + threadIdx_x // 7 + +(T.Mul(58, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(58, 7) + threadIdx_x // 7) % 9 + +T.Mul(58, 7) + +T.Mul(58, 7) + threadIdx_x // 7 + +(T.Mul(58, 7) + threadIdx_x // 7) % 9 + +(T.Mul(58, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(58, 7) + threadIdx_x // 7) % 9 and (T.Mul(58, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(58, 7) + threadIdx_x // 7) % 9 and (T.Mul(58, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(58, 7) + threadIdx_x // 7) % 9 and (T.Mul(58, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(58, 7) + +T.Mul(58, 7) + threadIdx_x // 7 + +(T.Mul(58, 7) + threadIdx_x // 7) // 9 + +(T.Mul(58, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(58, 7) + +T.Mul(58, 7) + threadIdx_x // 7 + +(T.Mul(58, 7) + threadIdx_x // 7) % 9 + +(T.Mul(58, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(58, 7) + threadIdx_x // 7) % 9 and (T.Mul(58, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + 
threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(58, 49) + +T.Mul(58, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(58, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(58, 7) + threadIdx_x // 7) % 9 and (T.Mul(58, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(58, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(58, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(58, 7) + threadIdx_x // 7) % 9 and (T.Mul(58, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(58, 7) + threadIdx_x // 7 < 576: + if T.Mul(58, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(58, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(58, 7) + threadIdx_x // 7) % 9 and (T.Mul(58, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(58, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(58, 7) + threadIdx_x // 7 < 576: + if T.Mul(58, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(58, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(58, 7) + threadIdx_x // 7) % 9 and (T.Mul(58, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +59 + +T.Mul(59, 7) + +T.Mul(59, 7) + threadIdx_x // 7 + +T.Mul(59, 7) + threadIdx_x // 7 < 576 + +T.Mul(59, 49) + +T.Mul(59, 49) + threadIdx_x + +T.Mul(59, 49) + threadIdx_x < 4032 + +T.Mul(59, 7) + +T.Mul(59, 7) + threadIdx_x // 7 + +(T.Mul(59, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(59, 7) + threadIdx_x // 7) % 9 + +T.Mul(59, 7) + +T.Mul(59, 7) + threadIdx_x // 7 + +(T.Mul(59, 7) + threadIdx_x // 7) % 9 + +(T.Mul(59, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(59, 7) + threadIdx_x // 7) % 9 and (T.Mul(59, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(59, 7) + threadIdx_x // 7) % 9 and (T.Mul(59, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(59, 7) + threadIdx_x // 7) % 9 and (T.Mul(59, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(59, 7) + +T.Mul(59, 7) + threadIdx_x // 7 + +(T.Mul(59, 7) + threadIdx_x // 7) // 9 + +(T.Mul(59, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(59, 7) + +T.Mul(59, 7) + threadIdx_x // 7 + +(T.Mul(59, 7) + threadIdx_x // 7) % 9 + +(T.Mul(59, 7) + threadIdx_x // 7) % 9 * 
7 + +rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(59, 7) + threadIdx_x // 7) % 9 and (T.Mul(59, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(59, 49) + +T.Mul(59, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(59, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(59, 7) + threadIdx_x // 7) % 9 and (T.Mul(59, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(59, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(59, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(59, 7) + 
threadIdx_x // 7) % 9 and (T.Mul(59, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(59, 7) + threadIdx_x // 7 < 576: + if T.Mul(59, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(59, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(59, 7) + threadIdx_x // 7) % 9 and (T.Mul(59, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(59, 7) + threadIdx_x // 7 < 576: + if T.Mul(59, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(59, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(59, 7) + threadIdx_x // 7) % 9 and (T.Mul(59, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +60 + +T.Mul(60, 7) + +T.Mul(60, 7) + threadIdx_x // 7 + +T.Mul(60, 7) + threadIdx_x // 7 < 576 + +T.Mul(60, 49) + +T.Mul(60, 49) + threadIdx_x + +T.Mul(60, 49) + threadIdx_x < 4032 + +T.Mul(60, 7) + +T.Mul(60, 7) + threadIdx_x // 7 + +(T.Mul(60, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(60, 7) + threadIdx_x // 
7) % 9 + +T.Mul(60, 7) + +T.Mul(60, 7) + threadIdx_x // 7 + +(T.Mul(60, 7) + threadIdx_x // 7) % 9 + +(T.Mul(60, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(60, 7) + threadIdx_x // 7) % 9 and (T.Mul(60, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(60, 7) + threadIdx_x // 7) % 9 and (T.Mul(60, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(60, 7) + threadIdx_x // 7) % 9 and (T.Mul(60, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(60, 7) + +T.Mul(60, 7) + threadIdx_x // 7 + +(T.Mul(60, 7) + threadIdx_x // 7) // 9 + +(T.Mul(60, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(60, 7) + +T.Mul(60, 7) + threadIdx_x // 7 + +(T.Mul(60, 7) + threadIdx_x // 7) % 9 + +(T.Mul(60, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(60, 7) + threadIdx_x // 7) % 9 and (T.Mul(60, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(60, 49) + +T.Mul(60, 
49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(60, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(60, 7) + threadIdx_x // 7) % 9 and (T.Mul(60, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(60, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(60, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(60, 7) + threadIdx_x // 7) % 9 and (T.Mul(60, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(60, 7) + threadIdx_x // 7 < 576: + if T.Mul(60, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(60, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(60, 7) + threadIdx_x // 7) % 9 and (T.Mul(60, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(60, 7) + threadIdx_x // 7 < 576: + if T.Mul(60, 49) + 
threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(60, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(60, 7) + threadIdx_x // 7) % 9 and (T.Mul(60, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +61 + +T.Mul(61, 7) + +T.Mul(61, 7) + threadIdx_x // 7 + +T.Mul(61, 7) + threadIdx_x // 7 < 576 + +T.Mul(61, 49) + +T.Mul(61, 49) + threadIdx_x + +T.Mul(61, 49) + threadIdx_x < 4032 + +T.Mul(61, 7) + +T.Mul(61, 7) + threadIdx_x // 7 + +(T.Mul(61, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(61, 7) + threadIdx_x // 7) % 9 + +T.Mul(61, 7) + +T.Mul(61, 7) + threadIdx_x // 7 + +(T.Mul(61, 7) + threadIdx_x // 7) % 9 + +(T.Mul(61, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(61, 7) + threadIdx_x // 7) % 9 and (T.Mul(61, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(61, 7) + threadIdx_x // 7) % 9 and (T.Mul(61, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(61, 7) + threadIdx_x // 7) % 9 and (T.Mul(61, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(61, 7) + +T.Mul(61, 7) + threadIdx_x // 7 + +(T.Mul(61, 7) + threadIdx_x // 7) // 9 + +(T.Mul(61, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(61, 7) + +T.Mul(61, 7) + threadIdx_x // 7 + +(T.Mul(61, 7) + threadIdx_x // 7) % 9 + +(T.Mul(61, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x // 
7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(61, 7) + threadIdx_x // 7) % 9 and (T.Mul(61, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(61, 49) + +T.Mul(61, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(61, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(61, 7) + threadIdx_x // 7) % 9 and (T.Mul(61, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(61, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(61, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(61, 7) + threadIdx_x // 7) % 9 and (T.Mul(61, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(61, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(61, 7) + threadIdx_x // 7 < 576: + if T.Mul(61, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(61, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(61, 7) + threadIdx_x // 7) % 9 and (T.Mul(61, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(61, 7) + threadIdx_x // 7 < 576: + if T.Mul(61, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(61, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(61, 7) + threadIdx_x // 7) % 9 and (T.Mul(61, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +62 + +T.Mul(62, 7) + +T.Mul(62, 7) + threadIdx_x // 7 + +T.Mul(62, 7) + threadIdx_x // 7 < 576 + +T.Mul(62, 49) + +T.Mul(62, 49) + threadIdx_x + +T.Mul(62, 49) + threadIdx_x < 4032 + +T.Mul(62, 7) + +T.Mul(62, 7) + threadIdx_x // 7 + +(T.Mul(62, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(62, 7) + threadIdx_x // 7) % 9 + +T.Mul(62, 7) + +T.Mul(62, 7) + threadIdx_x // 7 + +(T.Mul(62, 7) + threadIdx_x // 7) % 9 + +(T.Mul(62, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(62, 7) + threadIdx_x // 7) % 9 and (T.Mul(62, 7) + threadIdx_x // 7) 
% 9 < 8 + +1 <= (T.Mul(62, 7) + threadIdx_x // 7) % 9 and (T.Mul(62, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(62, 7) + threadIdx_x // 7) % 9 and (T.Mul(62, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(62, 7) + +T.Mul(62, 7) + threadIdx_x // 7 + +(T.Mul(62, 7) + threadIdx_x // 7) // 9 + +(T.Mul(62, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(62, 7) + +T.Mul(62, 7) + threadIdx_x // 7 + +(T.Mul(62, 7) + threadIdx_x // 7) % 9 + +(T.Mul(62, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(62, 7) + threadIdx_x // 7) % 9 and (T.Mul(62, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(62, 49) + +T.Mul(62, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(62, 49) + threadIdx_x] = 
T.if_then_else(1 <= (T.Mul(62, 7) + threadIdx_x // 7) % 9 and (T.Mul(62, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(62, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(62, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(62, 7) + threadIdx_x // 7) % 9 and (T.Mul(62, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(62, 7) + threadIdx_x // 7 < 576: + if T.Mul(62, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(62, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(62, 7) + threadIdx_x // 7) % 9 and (T.Mul(62, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(62, 7) + threadIdx_x // 7 < 576: + if T.Mul(62, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(62, 49) + threadIdx_x] = T.if_then_else(1 <= 
(T.Mul(62, 7) + threadIdx_x // 7) % 9 and (T.Mul(62, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +63 + +T.Mul(63, 7) + +T.Mul(63, 7) + threadIdx_x // 7 + +T.Mul(63, 7) + threadIdx_x // 7 < 576 + +T.Mul(63, 49) + +T.Mul(63, 49) + threadIdx_x + +T.Mul(63, 49) + threadIdx_x < 4032 + +T.Mul(63, 7) + +T.Mul(63, 7) + threadIdx_x // 7 + +(T.Mul(63, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(63, 7) + threadIdx_x // 7) % 9 + +T.Mul(63, 7) + +T.Mul(63, 7) + threadIdx_x // 7 + +(T.Mul(63, 7) + threadIdx_x // 7) % 9 + +(T.Mul(63, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(63, 7) + threadIdx_x // 7) % 9 and (T.Mul(63, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(63, 7) + threadIdx_x // 7) % 9 and (T.Mul(63, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(63, 7) + threadIdx_x // 7) % 9 and (T.Mul(63, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(63, 7) + +T.Mul(63, 7) + threadIdx_x // 7 + +(T.Mul(63, 7) + threadIdx_x // 7) // 9 + +(T.Mul(63, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(63, 7) + +T.Mul(63, 7) + threadIdx_x // 7 + +(T.Mul(63, 7) + threadIdx_x // 7) % 9 + +(T.Mul(63, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(63, 7) + 
threadIdx_x // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(63, 7) + threadIdx_x // 7) % 9 and (T.Mul(63, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(63, 49) + +T.Mul(63, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(63, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(63, 7) + threadIdx_x // 7) % 9 and (T.Mul(63, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(63, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(63, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(63, 7) + threadIdx_x // 7) % 9 and (T.Mul(63, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(63, 7) + threadIdx_x // 7 < 576: + if T.Mul(63, 49) + threadIdx_x < 4032: + 
pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(63, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(63, 7) + threadIdx_x // 7) % 9 and (T.Mul(63, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(63, 7) + threadIdx_x // 7 < 576: + if T.Mul(63, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(63, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(63, 7) + threadIdx_x // 7) % 9 and (T.Mul(63, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +64 + +T.Mul(64, 7) + +T.Mul(64, 7) + threadIdx_x // 7 + +T.Mul(64, 7) + threadIdx_x // 7 < 576 + +T.Mul(64, 49) + +T.Mul(64, 49) + threadIdx_x + +T.Mul(64, 49) + threadIdx_x < 4032 + +T.Mul(64, 7) + +T.Mul(64, 7) + threadIdx_x // 7 + +(T.Mul(64, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(64, 7) + threadIdx_x // 7) % 9 + +T.Mul(64, 7) + +T.Mul(64, 7) + threadIdx_x // 7 + +(T.Mul(64, 7) + threadIdx_x // 7) % 9 + +(T.Mul(64, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(64, 7) + threadIdx_x // 7) % 9 and (T.Mul(64, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(64, 7) + threadIdx_x // 7) % 9 and (T.Mul(64, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(64, 7) + threadIdx_x // 7) % 9 and (T.Mul(64, 7) + 
threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(64, 7) + +T.Mul(64, 7) + threadIdx_x // 7 + +(T.Mul(64, 7) + threadIdx_x // 7) // 9 + +(T.Mul(64, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(64, 7) + +T.Mul(64, 7) + threadIdx_x // 7 + +(T.Mul(64, 7) + threadIdx_x // 7) % 9 + +(T.Mul(64, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(64, 7) + threadIdx_x // 7) % 9 and (T.Mul(64, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(64, 49) + +T.Mul(64, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(64, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(64, 7) + threadIdx_x // 7) % 9 and (T.Mul(64, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer 
* 3136 + (T.Mul(64, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(64, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(64, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(64, 7) + threadIdx_x // 7) % 9 and (T.Mul(64, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(64, 7) + threadIdx_x // 7 < 576: + if T.Mul(64, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(64, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(64, 7) + threadIdx_x // 7) % 9 and (T.Mul(64, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(64, 7) + threadIdx_x // 7 < 576: + if T.Mul(64, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(64, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(64, 7) + threadIdx_x // 7) % 9 and (T.Mul(64, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(64, 7) + 
threadIdx_x // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +65 + +T.Mul(65, 7) + +T.Mul(65, 7) + threadIdx_x // 7 + +T.Mul(65, 7) + threadIdx_x // 7 < 576 + +T.Mul(65, 49) + +T.Mul(65, 49) + threadIdx_x + +T.Mul(65, 49) + threadIdx_x < 4032 + +T.Mul(65, 7) + +T.Mul(65, 7) + threadIdx_x // 7 + +(T.Mul(65, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(65, 7) + threadIdx_x // 7) % 9 + +T.Mul(65, 7) + +T.Mul(65, 7) + threadIdx_x // 7 + +(T.Mul(65, 7) + threadIdx_x // 7) % 9 + +(T.Mul(65, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(65, 7) + threadIdx_x // 7) % 9 and (T.Mul(65, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(65, 7) + threadIdx_x // 7) % 9 and (T.Mul(65, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(65, 7) + threadIdx_x // 7) % 9 and (T.Mul(65, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(65, 7) + +T.Mul(65, 7) + threadIdx_x // 7 + +(T.Mul(65, 7) + threadIdx_x // 7) // 9 + +(T.Mul(65, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(65, 7) + +T.Mul(65, 7) + threadIdx_x // 7 + +(T.Mul(65, 7) + threadIdx_x // 7) % 9 + +(T.Mul(65, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x // 7) 
% 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(65, 7) + threadIdx_x // 7) % 9 and (T.Mul(65, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(65, 49) + +T.Mul(65, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(65, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(65, 7) + threadIdx_x // 7) % 9 and (T.Mul(65, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(65, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(65, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(65, 7) + threadIdx_x // 7) % 9 and (T.Mul(65, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(65, 7) + threadIdx_x // 7 < 576: + if T.Mul(65, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(65, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(65, 7) + threadIdx_x 
// 7) % 9 and (T.Mul(65, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(65, 7) + threadIdx_x // 7 < 576: + if T.Mul(65, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(65, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(65, 7) + threadIdx_x // 7) % 9 and (T.Mul(65, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +66 + +T.Mul(66, 7) + +T.Mul(66, 7) + threadIdx_x // 7 + +T.Mul(66, 7) + threadIdx_x // 7 < 576 + +T.Mul(66, 49) + +T.Mul(66, 49) + threadIdx_x + +T.Mul(66, 49) + threadIdx_x < 4032 + +T.Mul(66, 7) + +T.Mul(66, 7) + threadIdx_x // 7 + +(T.Mul(66, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(66, 7) + threadIdx_x // 7) % 9 + +T.Mul(66, 7) + +T.Mul(66, 7) + threadIdx_x // 7 + +(T.Mul(66, 7) + threadIdx_x // 7) % 9 + +(T.Mul(66, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(66, 7) + threadIdx_x // 7) % 9 and (T.Mul(66, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(66, 7) + threadIdx_x // 7) % 9 and (T.Mul(66, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(66, 7) + threadIdx_x // 7) % 9 and (T.Mul(66, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(66, 7) + +T.Mul(66, 7) + threadIdx_x // 7 + +(T.Mul(66, 7) + threadIdx_x // 7) // 9 + +(T.Mul(66, 7) + threadIdx_x // 7) 
// 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(66, 7) + +T.Mul(66, 7) + threadIdx_x // 7 + +(T.Mul(66, 7) + threadIdx_x // 7) % 9 + +(T.Mul(66, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(66, 7) + threadIdx_x // 7) % 9 and (T.Mul(66, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(66, 49) + +T.Mul(66, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(66, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(66, 7) + threadIdx_x // 7) % 9 and (T.Mul(66, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(66, 49) + threadIdx_x < 4032: + pad_temp_shared 
= T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(66, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(66, 7) + threadIdx_x // 7) % 9 and (T.Mul(66, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(66, 7) + threadIdx_x // 7 < 576: + if T.Mul(66, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(66, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(66, 7) + threadIdx_x // 7) % 9 and (T.Mul(66, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(66, 7) + threadIdx_x // 7 < 576: + if T.Mul(66, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(66, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(66, 7) + threadIdx_x // 7) % 9 and (T.Mul(66, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +67 + +T.Mul(67, 7) + +T.Mul(67, 7) + threadIdx_x // 7 + +T.Mul(67, 7) + threadIdx_x // 7 < 576 + 
+T.Mul(67, 49) + +T.Mul(67, 49) + threadIdx_x + +T.Mul(67, 49) + threadIdx_x < 4032 + +T.Mul(67, 7) + +T.Mul(67, 7) + threadIdx_x // 7 + +(T.Mul(67, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(67, 7) + threadIdx_x // 7) % 9 + +T.Mul(67, 7) + +T.Mul(67, 7) + threadIdx_x // 7 + +(T.Mul(67, 7) + threadIdx_x // 7) % 9 + +(T.Mul(67, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(67, 7) + threadIdx_x // 7) % 9 and (T.Mul(67, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(67, 7) + threadIdx_x // 7) % 9 and (T.Mul(67, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(67, 7) + threadIdx_x // 7) % 9 and (T.Mul(67, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(67, 7) + +T.Mul(67, 7) + threadIdx_x // 7 + +(T.Mul(67, 7) + threadIdx_x // 7) // 9 + +(T.Mul(67, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(67, 7) + +T.Mul(67, 7) + threadIdx_x // 7 + +(T.Mul(67, 7) + threadIdx_x // 7) % 9 + +(T.Mul(67, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(67, 7) + threadIdx_x // 7) % 9 and (T.Mul(67, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + 
threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(67, 49) + +T.Mul(67, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(67, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(67, 7) + threadIdx_x // 7) % 9 and (T.Mul(67, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(67, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(67, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(67, 7) + threadIdx_x // 7) % 9 and (T.Mul(67, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(67, 7) + threadIdx_x // 7 < 576: + if T.Mul(67, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(67, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(67, 7) + threadIdx_x // 7) % 9 and (T.Mul(67, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(67, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(67, 7) + threadIdx_x // 7 < 576: + if T.Mul(67, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(67, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(67, 7) + threadIdx_x // 7) % 9 and (T.Mul(67, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +68 + +T.Mul(68, 7) + +T.Mul(68, 7) + threadIdx_x // 7 + +T.Mul(68, 7) + threadIdx_x // 7 < 576 + +T.Mul(68, 49) + +T.Mul(68, 49) + threadIdx_x + +T.Mul(68, 49) + threadIdx_x < 4032 + +T.Mul(68, 7) + +T.Mul(68, 7) + threadIdx_x // 7 + +(T.Mul(68, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(68, 7) + threadIdx_x // 7) % 9 + +T.Mul(68, 7) + +T.Mul(68, 7) + threadIdx_x // 7 + +(T.Mul(68, 7) + threadIdx_x // 7) % 9 + +(T.Mul(68, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(68, 7) + threadIdx_x // 7) % 9 and (T.Mul(68, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(68, 7) + threadIdx_x // 7) % 9 and (T.Mul(68, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(68, 7) + threadIdx_x // 7) % 9 and (T.Mul(68, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(68, 7) + +T.Mul(68, 7) + threadIdx_x // 7 + +(T.Mul(68, 7) + threadIdx_x // 7) // 9 + +(T.Mul(68, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(68, 7) + +T.Mul(68, 7) + threadIdx_x // 7 + +(T.Mul(68, 7) + threadIdx_x // 7) % 9 + +(T.Mul(68, 7) + threadIdx_x // 7) % 9 * 
7 + +rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(68, 7) + threadIdx_x // 7) % 9 and (T.Mul(68, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(68, 49) + +T.Mul(68, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(68, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(68, 7) + threadIdx_x // 7) % 9 and (T.Mul(68, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(68, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(68, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(68, 7) + 
threadIdx_x // 7) % 9 and (T.Mul(68, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(68, 7) + threadIdx_x // 7 < 576: + if T.Mul(68, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(68, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(68, 7) + threadIdx_x // 7) % 9 and (T.Mul(68, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(68, 7) + threadIdx_x // 7 < 576: + if T.Mul(68, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(68, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(68, 7) + threadIdx_x // 7) % 9 and (T.Mul(68, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +69 + +T.Mul(69, 7) + +T.Mul(69, 7) + threadIdx_x // 7 + +T.Mul(69, 7) + threadIdx_x // 7 < 576 + +T.Mul(69, 49) + +T.Mul(69, 49) + threadIdx_x + +T.Mul(69, 49) + threadIdx_x < 4032 + +T.Mul(69, 7) + +T.Mul(69, 7) + threadIdx_x // 7 + +(T.Mul(69, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(69, 7) + threadIdx_x // 
7) % 9 + +T.Mul(69, 7) + +T.Mul(69, 7) + threadIdx_x // 7 + +(T.Mul(69, 7) + threadIdx_x // 7) % 9 + +(T.Mul(69, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(69, 7) + threadIdx_x // 7) % 9 and (T.Mul(69, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(69, 7) + threadIdx_x // 7) % 9 and (T.Mul(69, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(69, 7) + threadIdx_x // 7) % 9 and (T.Mul(69, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(69, 7) + +T.Mul(69, 7) + threadIdx_x // 7 + +(T.Mul(69, 7) + threadIdx_x // 7) // 9 + +(T.Mul(69, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(69, 7) + +T.Mul(69, 7) + threadIdx_x // 7 + +(T.Mul(69, 7) + threadIdx_x // 7) % 9 + +(T.Mul(69, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(69, 7) + threadIdx_x // 7) % 9 and (T.Mul(69, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(69, 49) + +T.Mul(69, 
49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(69, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(69, 7) + threadIdx_x // 7) % 9 and (T.Mul(69, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(69, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(69, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(69, 7) + threadIdx_x // 7) % 9 and (T.Mul(69, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(69, 7) + threadIdx_x // 7 < 576: + if T.Mul(69, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(69, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(69, 7) + threadIdx_x // 7) % 9 and (T.Mul(69, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(69, 7) + threadIdx_x // 7 < 576: + if T.Mul(69, 49) + 
threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(69, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(69, 7) + threadIdx_x // 7) % 9 and (T.Mul(69, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +70 + +T.Mul(70, 7) + +T.Mul(70, 7) + threadIdx_x // 7 + +T.Mul(70, 7) + threadIdx_x // 7 < 576 + +T.Mul(70, 49) + +T.Mul(70, 49) + threadIdx_x + +T.Mul(70, 49) + threadIdx_x < 4032 + +T.Mul(70, 7) + +T.Mul(70, 7) + threadIdx_x // 7 + +(T.Mul(70, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(70, 7) + threadIdx_x // 7) % 9 + +T.Mul(70, 7) + +T.Mul(70, 7) + threadIdx_x // 7 + +(T.Mul(70, 7) + threadIdx_x // 7) % 9 + +(T.Mul(70, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(70, 7) + threadIdx_x // 7) % 9 and (T.Mul(70, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(70, 7) + threadIdx_x // 7) % 9 and (T.Mul(70, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(70, 7) + threadIdx_x // 7) % 9 and (T.Mul(70, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(70, 7) + +T.Mul(70, 7) + threadIdx_x // 7 + +(T.Mul(70, 7) + threadIdx_x // 7) // 9 + +(T.Mul(70, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(70, 7) + +T.Mul(70, 7) + threadIdx_x // 7 + +(T.Mul(70, 7) + threadIdx_x // 7) % 9 + +(T.Mul(70, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x // 
7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(70, 7) + threadIdx_x // 7) % 9 and (T.Mul(70, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(70, 49) + +T.Mul(70, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(70, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(70, 7) + threadIdx_x // 7) % 9 and (T.Mul(70, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(70, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(70, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(70, 7) + threadIdx_x // 7) % 9 and (T.Mul(70, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(70, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(70, 7) + threadIdx_x // 7 < 576: + if T.Mul(70, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(70, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(70, 7) + threadIdx_x // 7) % 9 and (T.Mul(70, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(70, 7) + threadIdx_x // 7 < 576: + if T.Mul(70, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(70, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(70, 7) + threadIdx_x // 7) % 9 and (T.Mul(70, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +71 + +T.Mul(71, 7) + +T.Mul(71, 7) + threadIdx_x // 7 + +T.Mul(71, 7) + threadIdx_x // 7 < 576 + +T.Mul(71, 49) + +T.Mul(71, 49) + threadIdx_x + +T.Mul(71, 49) + threadIdx_x < 4032 + +T.Mul(71, 7) + +T.Mul(71, 7) + threadIdx_x // 7 + +(T.Mul(71, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(71, 7) + threadIdx_x // 7) % 9 + +T.Mul(71, 7) + +T.Mul(71, 7) + threadIdx_x // 7 + +(T.Mul(71, 7) + threadIdx_x // 7) % 9 + +(T.Mul(71, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(71, 7) + threadIdx_x // 7) % 9 and (T.Mul(71, 7) + threadIdx_x // 7) 
% 9 < 8 + +1 <= (T.Mul(71, 7) + threadIdx_x // 7) % 9 and (T.Mul(71, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(71, 7) + threadIdx_x // 7) % 9 and (T.Mul(71, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(71, 7) + +T.Mul(71, 7) + threadIdx_x // 7 + +(T.Mul(71, 7) + threadIdx_x // 7) // 9 + +(T.Mul(71, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(71, 7) + +T.Mul(71, 7) + threadIdx_x // 7 + +(T.Mul(71, 7) + threadIdx_x // 7) % 9 + +(T.Mul(71, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(71, 7) + threadIdx_x // 7) % 9 and (T.Mul(71, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(71, 49) + +T.Mul(71, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(71, 49) + threadIdx_x] = 
T.if_then_else(1 <= (T.Mul(71, 7) + threadIdx_x // 7) % 9 and (T.Mul(71, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(71, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(71, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(71, 7) + threadIdx_x // 7) % 9 and (T.Mul(71, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(71, 7) + threadIdx_x // 7 < 576: + if T.Mul(71, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(71, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(71, 7) + threadIdx_x // 7) % 9 and (T.Mul(71, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(71, 7) + threadIdx_x // 7 < 576: + if T.Mul(71, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(71, 49) + threadIdx_x] = T.if_then_else(1 <= 
(T.Mul(71, 7) + threadIdx_x // 7) % 9 and (T.Mul(71, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +72 + +T.Mul(72, 7) + +T.Mul(72, 7) + threadIdx_x // 7 + +T.Mul(72, 7) + threadIdx_x // 7 < 576 + +T.Mul(72, 49) + +T.Mul(72, 49) + threadIdx_x + +T.Mul(72, 49) + threadIdx_x < 4032 + +T.Mul(72, 7) + +T.Mul(72, 7) + threadIdx_x // 7 + +(T.Mul(72, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(72, 7) + threadIdx_x // 7) % 9 + +T.Mul(72, 7) + +T.Mul(72, 7) + threadIdx_x // 7 + +(T.Mul(72, 7) + threadIdx_x // 7) % 9 + +(T.Mul(72, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(72, 7) + threadIdx_x // 7) % 9 and (T.Mul(72, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(72, 7) + threadIdx_x // 7) % 9 and (T.Mul(72, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(72, 7) + threadIdx_x // 7) % 9 and (T.Mul(72, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(72, 7) + +T.Mul(72, 7) + threadIdx_x // 7 + +(T.Mul(72, 7) + threadIdx_x // 7) // 9 + +(T.Mul(72, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(72, 7) + +T.Mul(72, 7) + threadIdx_x // 7 + +(T.Mul(72, 7) + threadIdx_x // 7) % 9 + +(T.Mul(72, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(72, 7) + 
threadIdx_x // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(72, 7) + threadIdx_x // 7) % 9 and (T.Mul(72, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(72, 49) + +T.Mul(72, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(72, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(72, 7) + threadIdx_x // 7) % 9 and (T.Mul(72, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(72, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(72, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(72, 7) + threadIdx_x // 7) % 9 and (T.Mul(72, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(72, 7) + threadIdx_x // 7 < 576: + if T.Mul(72, 49) + threadIdx_x < 4032: + 
pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(72, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(72, 7) + threadIdx_x // 7) % 9 and (T.Mul(72, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(72, 7) + threadIdx_x // 7 < 576: + if T.Mul(72, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(72, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(72, 7) + threadIdx_x // 7) % 9 and (T.Mul(72, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +73 + +T.Mul(73, 7) + +T.Mul(73, 7) + threadIdx_x // 7 + +T.Mul(73, 7) + threadIdx_x // 7 < 576 + +T.Mul(73, 49) + +T.Mul(73, 49) + threadIdx_x + +T.Mul(73, 49) + threadIdx_x < 4032 + +T.Mul(73, 7) + +T.Mul(73, 7) + threadIdx_x // 7 + +(T.Mul(73, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(73, 7) + threadIdx_x // 7) % 9 + +T.Mul(73, 7) + +T.Mul(73, 7) + threadIdx_x // 7 + +(T.Mul(73, 7) + threadIdx_x // 7) % 9 + +(T.Mul(73, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(73, 7) + threadIdx_x // 7) % 9 and (T.Mul(73, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(73, 7) + threadIdx_x // 7) % 9 and (T.Mul(73, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(73, 7) + threadIdx_x // 7) % 9 and (T.Mul(73, 7) + 
threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(73, 7) + +T.Mul(73, 7) + threadIdx_x // 7 + +(T.Mul(73, 7) + threadIdx_x // 7) // 9 + +(T.Mul(73, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(73, 7) + +T.Mul(73, 7) + threadIdx_x // 7 + +(T.Mul(73, 7) + threadIdx_x // 7) % 9 + +(T.Mul(73, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(73, 7) + threadIdx_x // 7) % 9 and (T.Mul(73, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(73, 49) + +T.Mul(73, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(73, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(73, 7) + threadIdx_x // 7) % 9 and (T.Mul(73, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer 
* 3136 + (T.Mul(73, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(73, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(73, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(73, 7) + threadIdx_x // 7) % 9 and (T.Mul(73, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(73, 7) + threadIdx_x // 7 < 576: + if T.Mul(73, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(73, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(73, 7) + threadIdx_x // 7) % 9 and (T.Mul(73, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(73, 7) + threadIdx_x // 7 < 576: + if T.Mul(73, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(73, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(73, 7) + threadIdx_x // 7) % 9 and (T.Mul(73, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(73, 7) + 
threadIdx_x // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +74 + +T.Mul(74, 7) + +T.Mul(74, 7) + threadIdx_x // 7 + +T.Mul(74, 7) + threadIdx_x // 7 < 576 + +T.Mul(74, 49) + +T.Mul(74, 49) + threadIdx_x + +T.Mul(74, 49) + threadIdx_x < 4032 + +T.Mul(74, 7) + +T.Mul(74, 7) + threadIdx_x // 7 + +(T.Mul(74, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(74, 7) + threadIdx_x // 7) % 9 + +T.Mul(74, 7) + +T.Mul(74, 7) + threadIdx_x // 7 + +(T.Mul(74, 7) + threadIdx_x // 7) % 9 + +(T.Mul(74, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(74, 7) + threadIdx_x // 7) % 9 and (T.Mul(74, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(74, 7) + threadIdx_x // 7) % 9 and (T.Mul(74, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(74, 7) + threadIdx_x // 7) % 9 and (T.Mul(74, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(74, 7) + +T.Mul(74, 7) + threadIdx_x // 7 + +(T.Mul(74, 7) + threadIdx_x // 7) // 9 + +(T.Mul(74, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(74, 7) + +T.Mul(74, 7) + threadIdx_x // 7 + +(T.Mul(74, 7) + threadIdx_x // 7) % 9 + +(T.Mul(74, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x // 7) 
% 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(74, 7) + threadIdx_x // 7) % 9 and (T.Mul(74, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(74, 49) + +T.Mul(74, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(74, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(74, 7) + threadIdx_x // 7) % 9 and (T.Mul(74, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(74, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(74, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(74, 7) + threadIdx_x // 7) % 9 and (T.Mul(74, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(74, 7) + threadIdx_x // 7 < 576: + if T.Mul(74, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(74, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(74, 7) + threadIdx_x 
// 7) % 9 and (T.Mul(74, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(74, 7) + threadIdx_x // 7 < 576: + if T.Mul(74, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(74, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(74, 7) + threadIdx_x // 7) % 9 and (T.Mul(74, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +75 + +T.Mul(75, 7) + +T.Mul(75, 7) + threadIdx_x // 7 + +T.Mul(75, 7) + threadIdx_x // 7 < 576 + +T.Mul(75, 49) + +T.Mul(75, 49) + threadIdx_x + +T.Mul(75, 49) + threadIdx_x < 4032 + +T.Mul(75, 7) + +T.Mul(75, 7) + threadIdx_x // 7 + +(T.Mul(75, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(75, 7) + threadIdx_x // 7) % 9 + +T.Mul(75, 7) + +T.Mul(75, 7) + threadIdx_x // 7 + +(T.Mul(75, 7) + threadIdx_x // 7) % 9 + +(T.Mul(75, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(75, 7) + threadIdx_x // 7) % 9 and (T.Mul(75, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(75, 7) + threadIdx_x // 7) % 9 and (T.Mul(75, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(75, 7) + threadIdx_x // 7) % 9 and (T.Mul(75, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(75, 7) + +T.Mul(75, 7) + threadIdx_x // 7 + +(T.Mul(75, 7) + threadIdx_x // 7) // 9 + +(T.Mul(75, 7) + threadIdx_x // 7) 
// 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(75, 7) + +T.Mul(75, 7) + threadIdx_x // 7 + +(T.Mul(75, 7) + threadIdx_x // 7) % 9 + +(T.Mul(75, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(75, 7) + threadIdx_x // 7) % 9 and (T.Mul(75, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(75, 49) + +T.Mul(75, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(75, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(75, 7) + threadIdx_x // 7) % 9 and (T.Mul(75, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(75, 49) + threadIdx_x < 4032: + pad_temp_shared 
= T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(75, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(75, 7) + threadIdx_x // 7) % 9 and (T.Mul(75, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(75, 7) + threadIdx_x // 7 < 576: + if T.Mul(75, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(75, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(75, 7) + threadIdx_x // 7) % 9 and (T.Mul(75, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(75, 7) + threadIdx_x // 7 < 576: + if T.Mul(75, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(75, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(75, 7) + threadIdx_x // 7) % 9 and (T.Mul(75, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +76 + +T.Mul(76, 7) + +T.Mul(76, 7) + threadIdx_x // 7 + +T.Mul(76, 7) + threadIdx_x // 7 < 576 + 
+T.Mul(76, 49) + +T.Mul(76, 49) + threadIdx_x + +T.Mul(76, 49) + threadIdx_x < 4032 + +T.Mul(76, 7) + +T.Mul(76, 7) + threadIdx_x // 7 + +(T.Mul(76, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(76, 7) + threadIdx_x // 7) % 9 + +T.Mul(76, 7) + +T.Mul(76, 7) + threadIdx_x // 7 + +(T.Mul(76, 7) + threadIdx_x // 7) % 9 + +(T.Mul(76, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(76, 7) + threadIdx_x // 7) % 9 and (T.Mul(76, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(76, 7) + threadIdx_x // 7) % 9 and (T.Mul(76, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(76, 7) + threadIdx_x // 7) % 9 and (T.Mul(76, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(76, 7) + +T.Mul(76, 7) + threadIdx_x // 7 + +(T.Mul(76, 7) + threadIdx_x // 7) // 9 + +(T.Mul(76, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(76, 7) + +T.Mul(76, 7) + threadIdx_x // 7 + +(T.Mul(76, 7) + threadIdx_x // 7) % 9 + +(T.Mul(76, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(76, 7) + threadIdx_x // 7) % 9 and (T.Mul(76, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + 
threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(76, 49) + +T.Mul(76, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(76, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(76, 7) + threadIdx_x // 7) % 9 and (T.Mul(76, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(76, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(76, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(76, 7) + threadIdx_x // 7) % 9 and (T.Mul(76, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(76, 7) + threadIdx_x // 7 < 576: + if T.Mul(76, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(76, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(76, 7) + threadIdx_x // 7) % 9 and (T.Mul(76, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(76, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(76, 7) + threadIdx_x // 7 < 576: + if T.Mul(76, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(76, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(76, 7) + threadIdx_x // 7) % 9 and (T.Mul(76, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +77 + +T.Mul(77, 7) + +T.Mul(77, 7) + threadIdx_x // 7 + +T.Mul(77, 7) + threadIdx_x // 7 < 576 + +T.Mul(77, 49) + +T.Mul(77, 49) + threadIdx_x + +T.Mul(77, 49) + threadIdx_x < 4032 + +T.Mul(77, 7) + +T.Mul(77, 7) + threadIdx_x // 7 + +(T.Mul(77, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(77, 7) + threadIdx_x // 7) % 9 + +T.Mul(77, 7) + +T.Mul(77, 7) + threadIdx_x // 7 + +(T.Mul(77, 7) + threadIdx_x // 7) % 9 + +(T.Mul(77, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(77, 7) + threadIdx_x // 7) % 9 and (T.Mul(77, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(77, 7) + threadIdx_x // 7) % 9 and (T.Mul(77, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(77, 7) + threadIdx_x // 7) % 9 and (T.Mul(77, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(77, 7) + +T.Mul(77, 7) + threadIdx_x // 7 + +(T.Mul(77, 7) + threadIdx_x // 7) // 9 + +(T.Mul(77, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(77, 7) + +T.Mul(77, 7) + threadIdx_x // 7 + +(T.Mul(77, 7) + threadIdx_x // 7) % 9 + +(T.Mul(77, 7) + threadIdx_x // 7) % 9 * 
7 + +rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(77, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(77, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(77, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(77, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(77, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(77, 7) + threadIdx_x // 7) % 9 and (T.Mul(77, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(77, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(77, 49) + +T.Mul(77, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(77, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(77, 7) + threadIdx_x // 7) % 9 and (T.Mul(77, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(77, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(77, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(77, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(77, 7) + 
threadIdx_x // 7) % 9 and (T.Mul(77, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(77, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(77, 7) + threadIdx_x // 7 < 576: + if T.Mul(77, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(77, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(77, 7) + threadIdx_x // 7) % 9 and (T.Mul(77, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(77, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(77, 7) + threadIdx_x // 7 < 576: + if T.Mul(77, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(77, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(77, 7) + threadIdx_x // 7) % 9 and (T.Mul(77, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(77, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +78 + +T.Mul(78, 7) + +T.Mul(78, 7) + threadIdx_x // 7 + +T.Mul(78, 7) + threadIdx_x // 7 < 576 + +T.Mul(78, 49) + +T.Mul(78, 49) + threadIdx_x + +T.Mul(78, 49) + threadIdx_x < 4032 + +T.Mul(78, 7) + +T.Mul(78, 7) + threadIdx_x // 7 + +(T.Mul(78, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(78, 7) + threadIdx_x // 
7) % 9 + +T.Mul(78, 7) + +T.Mul(78, 7) + threadIdx_x // 7 + +(T.Mul(78, 7) + threadIdx_x // 7) % 9 + +(T.Mul(78, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(78, 7) + threadIdx_x // 7) % 9 and (T.Mul(78, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(78, 7) + threadIdx_x // 7) % 9 and (T.Mul(78, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(78, 7) + threadIdx_x // 7) % 9 and (T.Mul(78, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(78, 7) + +T.Mul(78, 7) + threadIdx_x // 7 + +(T.Mul(78, 7) + threadIdx_x // 7) // 9 + +(T.Mul(78, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(78, 7) + +T.Mul(78, 7) + threadIdx_x // 7 + +(T.Mul(78, 7) + threadIdx_x // 7) % 9 + +(T.Mul(78, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(78, 7) + threadIdx_x // 7) % 9 and (T.Mul(78, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(78, 49) + +T.Mul(78, 
49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(78, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(78, 7) + threadIdx_x // 7) % 9 and (T.Mul(78, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(78, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(78, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(78, 7) + threadIdx_x // 7) % 9 and (T.Mul(78, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(78, 7) + threadIdx_x // 7 < 576: + if T.Mul(78, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(78, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(78, 7) + threadIdx_x // 7) % 9 and (T.Mul(78, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(78, 7) + threadIdx_x // 7 < 576: + if T.Mul(78, 49) + 
threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(78, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(78, 7) + threadIdx_x // 7) % 9 and (T.Mul(78, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +79 + +T.Mul(79, 7) + +T.Mul(79, 7) + threadIdx_x // 7 + +T.Mul(79, 7) + threadIdx_x // 7 < 576 + +T.Mul(79, 49) + +T.Mul(79, 49) + threadIdx_x + +T.Mul(79, 49) + threadIdx_x < 4032 + +T.Mul(79, 7) + +T.Mul(79, 7) + threadIdx_x // 7 + +(T.Mul(79, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(79, 7) + threadIdx_x // 7) % 9 + +T.Mul(79, 7) + +T.Mul(79, 7) + threadIdx_x // 7 + +(T.Mul(79, 7) + threadIdx_x // 7) % 9 + +(T.Mul(79, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(79, 7) + threadIdx_x // 7) % 9 and (T.Mul(79, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(79, 7) + threadIdx_x // 7) % 9 and (T.Mul(79, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(79, 7) + threadIdx_x // 7) % 9 and (T.Mul(79, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(79, 7) + +T.Mul(79, 7) + threadIdx_x // 7 + +(T.Mul(79, 7) + threadIdx_x // 7) // 9 + +(T.Mul(79, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(79, 7) + +T.Mul(79, 7) + threadIdx_x // 7 + +(T.Mul(79, 7) + threadIdx_x // 7) % 9 + +(T.Mul(79, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x // 
7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(79, 7) + threadIdx_x // 7) % 9 and (T.Mul(79, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(79, 49) + +T.Mul(79, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(79, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(79, 7) + threadIdx_x // 7) % 9 and (T.Mul(79, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(79, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(79, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(79, 7) + threadIdx_x // 7) % 9 and (T.Mul(79, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(79, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(79, 7) + threadIdx_x // 7 < 576: + if T.Mul(79, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(79, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(79, 7) + threadIdx_x // 7) % 9 and (T.Mul(79, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(79, 7) + threadIdx_x // 7 < 576: + if T.Mul(79, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(79, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(79, 7) + threadIdx_x // 7) % 9 and (T.Mul(79, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +80 + +T.Mul(80, 7) + +T.Mul(80, 7) + threadIdx_x // 7 + +T.Mul(80, 7) + threadIdx_x // 7 < 576 + +T.Mul(80, 49) + +T.Mul(80, 49) + threadIdx_x + +T.Mul(80, 49) + threadIdx_x < 4032 + +T.Mul(80, 7) + +T.Mul(80, 7) + threadIdx_x // 7 + +(T.Mul(80, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(80, 7) + threadIdx_x // 7) % 9 + +T.Mul(80, 7) + +T.Mul(80, 7) + threadIdx_x // 7 + +(T.Mul(80, 7) + threadIdx_x // 7) % 9 + +(T.Mul(80, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(80, 7) + threadIdx_x // 7) % 9 and (T.Mul(80, 7) + threadIdx_x // 7) 
% 9 < 8 + +1 <= (T.Mul(80, 7) + threadIdx_x // 7) % 9 and (T.Mul(80, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(80, 7) + threadIdx_x // 7) % 9 and (T.Mul(80, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(80, 7) + +T.Mul(80, 7) + threadIdx_x // 7 + +(T.Mul(80, 7) + threadIdx_x // 7) // 9 + +(T.Mul(80, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(80, 7) + +T.Mul(80, 7) + threadIdx_x // 7 + +(T.Mul(80, 7) + threadIdx_x // 7) % 9 + +(T.Mul(80, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(80, 7) + threadIdx_x // 7) % 9 and (T.Mul(80, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(80, 49) + +T.Mul(80, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(80, 49) + threadIdx_x] = 
T.if_then_else(1 <= (T.Mul(80, 7) + threadIdx_x // 7) % 9 and (T.Mul(80, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(80, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(80, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(80, 7) + threadIdx_x // 7) % 9 and (T.Mul(80, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(80, 7) + threadIdx_x // 7 < 576: + if T.Mul(80, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(80, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(80, 7) + threadIdx_x // 7) % 9 and (T.Mul(80, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(80, 7) + threadIdx_x // 7 < 576: + if T.Mul(80, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(80, 49) + threadIdx_x] = T.if_then_else(1 <= 
(T.Mul(80, 7) + threadIdx_x // 7) % 9 and (T.Mul(80, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +81 + +T.Mul(81, 7) + +T.Mul(81, 7) + threadIdx_x // 7 + +T.Mul(81, 7) + threadIdx_x // 7 < 576 + +T.Mul(81, 49) + +T.Mul(81, 49) + threadIdx_x + +T.Mul(81, 49) + threadIdx_x < 4032 + +T.Mul(81, 7) + +T.Mul(81, 7) + threadIdx_x // 7 + +(T.Mul(81, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(81, 7) + threadIdx_x // 7) % 9 + +T.Mul(81, 7) + +T.Mul(81, 7) + threadIdx_x // 7 + +(T.Mul(81, 7) + threadIdx_x // 7) % 9 + +(T.Mul(81, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(81, 7) + threadIdx_x // 7) % 9 and (T.Mul(81, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(81, 7) + threadIdx_x // 7) % 9 and (T.Mul(81, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(81, 7) + threadIdx_x // 7) % 9 and (T.Mul(81, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(81, 7) + +T.Mul(81, 7) + threadIdx_x // 7 + +(T.Mul(81, 7) + threadIdx_x // 7) // 9 + +(T.Mul(81, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(81, 7) + +T.Mul(81, 7) + threadIdx_x // 7 + +(T.Mul(81, 7) + threadIdx_x // 7) % 9 + +(T.Mul(81, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(81, 7) + 
threadIdx_x // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(81, 7) + threadIdx_x // 7) % 9 and (T.Mul(81, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(81, 49) + +T.Mul(81, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(81, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(81, 7) + threadIdx_x // 7) % 9 and (T.Mul(81, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(81, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(81, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(81, 7) + threadIdx_x // 7) % 9 and (T.Mul(81, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(81, 7) + threadIdx_x // 7 < 576: + if T.Mul(81, 49) + threadIdx_x < 4032: + 
pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(81, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(81, 7) + threadIdx_x // 7) % 9 and (T.Mul(81, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(81, 7) + threadIdx_x // 7 < 576: + if T.Mul(81, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(81, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(81, 7) + threadIdx_x // 7) % 9 and (T.Mul(81, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +82 + +T.Mul(82, 7) + +T.Mul(82, 7) + threadIdx_x // 7 + +T.Mul(82, 7) + threadIdx_x // 7 < 576 + +T.Mul(82, 49) + +T.Mul(82, 49) + threadIdx_x + +T.Mul(82, 49) + threadIdx_x < 4032 + +T.Mul(82, 7) + +T.Mul(82, 7) + threadIdx_x // 7 + +(T.Mul(82, 7) + threadIdx_x // 7) % 9 + +1 <= (T.Mul(82, 7) + threadIdx_x // 7) % 9 + +T.Mul(82, 7) + +T.Mul(82, 7) + threadIdx_x // 7 + +(T.Mul(82, 7) + threadIdx_x // 7) % 9 + +(T.Mul(82, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(82, 7) + threadIdx_x // 7) % 9 and (T.Mul(82, 7) + threadIdx_x // 7) % 9 < 8 + +1 <= (T.Mul(82, 7) + threadIdx_x // 7) % 9 and (T.Mul(82, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (T.Mul(82, 7) + threadIdx_x // 7) % 9 and (T.Mul(82, 7) + 
threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +T.Mul(82, 7) + +T.Mul(82, 7) + threadIdx_x // 7 + +(T.Mul(82, 7) + threadIdx_x // 7) // 9 + +(T.Mul(82, 7) + threadIdx_x // 7) // 9 * 49 + +rc_outer_outer * 3136 + (T.Mul(82, 7) + threadIdx_x // 7) // 9 * 49 + +T.Mul(82, 7) + +T.Mul(82, 7) + threadIdx_x // 7 + +(T.Mul(82, 7) + threadIdx_x // 7) % 9 + +(T.Mul(82, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(82, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x // 7) % 9 * 7 + +rc_outer_outer * 3136 + (T.Mul(82, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + +rc_outer_outer * 3136 + (T.Mul(82, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +rc_outer_outer * 3136 + (T.Mul(82, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (T.Mul(82, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (T.Mul(82, 7) + threadIdx_x // 7) % 9 and (T.Mul(82, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(82, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +T.Mul(82, 49) + +T.Mul(82, 49) + threadIdx_x + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[T.Mul(82, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(82, 7) + threadIdx_x // 7) % 9 and (T.Mul(82, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer 
* 3136 + (T.Mul(82, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(82, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(82, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(82, 7) + threadIdx_x // 7) % 9 and (T.Mul(82, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(82, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +threadIdx_x = T.int32() +if T.Mul(82, 7) + threadIdx_x // 7 < 576: + if T.Mul(82, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(82, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(82, 7) + threadIdx_x // 7) % 9 and (T.Mul(82, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(82, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(82, 7) + threadIdx_x // 7 < 576: + if T.Mul(82, 49) + threadIdx_x < 4032: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[T.Mul(82, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(82, 7) + threadIdx_x // 7) % 9 and (T.Mul(82, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(82, 7) + 
threadIdx_x // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +49 + +0 + +49 + +T.Mul(0, 49) + +threadIdx_x + +T.Mul(0, 49) + threadIdx_x + +1536 + +T.Mul(0, 49) + threadIdx_x < 1536 + +blockIdx_x + +36864 + +blockIdx_x * 36864 + +49 + +T.Mul(0, 49) + +T.Mul(0, 49) + threadIdx_x + +192 + +(T.Mul(0, 49) + threadIdx_x) // 192 + +4608 + +(T.Mul(0, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(0, 49) + threadIdx_x) // 192 * 4608 + +576 + +rc_outer_outer * 576 + +blockIdx_x * 36864 + (T.Mul(0, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + +49 + +T.Mul(0, 49) + +T.Mul(0, 49) + threadIdx_x + +192 + +(T.Mul(0, 49) + threadIdx_x) % 192 + +3 + +(T.Mul(0, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(0, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(0, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(0, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(0, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (T.Mul(0, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(0, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +49 + +T.Mul(0, 49) + +T.Mul(0, 49) + threadIdx_x + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[T.Mul(0, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(0, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(0, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +threadIdx_x = T.int32() +if T.Mul(0, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(0, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(0, 49) + threadIdx_x) // 192 * 
4608 + rc_outer_outer * 576 + (T.Mul(0, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(0, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(0, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(0, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(0, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +1 + +T.Mul(1, 49) + +T.Mul(1, 49) + threadIdx_x + +T.Mul(1, 49) + threadIdx_x < 1536 + +T.Mul(1, 49) + +T.Mul(1, 49) + threadIdx_x + +(T.Mul(1, 49) + threadIdx_x) // 192 + +(T.Mul(1, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(1, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(1, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + +T.Mul(1, 49) + +T.Mul(1, 49) + threadIdx_x + +(T.Mul(1, 49) + threadIdx_x) % 192 + +(T.Mul(1, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(1, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(1, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(1, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(1, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (T.Mul(1, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(1, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +T.Mul(1, 49) + +T.Mul(1, 49) + threadIdx_x + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[T.Mul(1, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(1, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(1, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +threadIdx_x = T.int32() +if T.Mul(1, 49) + threadIdx_x < 1536: + 
kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(1, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(1, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(1, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(1, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(1, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(1, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(1, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +2 + +T.Mul(2, 49) + +T.Mul(2, 49) + threadIdx_x + +T.Mul(2, 49) + threadIdx_x < 1536 + +T.Mul(2, 49) + +T.Mul(2, 49) + threadIdx_x + +(T.Mul(2, 49) + threadIdx_x) // 192 + +(T.Mul(2, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(2, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(2, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + +T.Mul(2, 49) + +T.Mul(2, 49) + threadIdx_x + +(T.Mul(2, 49) + threadIdx_x) % 192 + +(T.Mul(2, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(2, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(2, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(2, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(2, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (T.Mul(2, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(2, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +T.Mul(2, 49) + +T.Mul(2, 49) + threadIdx_x + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = 
T.int32() +kernel_shared[T.Mul(2, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(2, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(2, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +threadIdx_x = T.int32() +if T.Mul(2, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(2, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(2, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(2, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(2, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(2, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(2, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(2, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +3 + +T.Mul(3, 49) + +T.Mul(3, 49) + threadIdx_x + +T.Mul(3, 49) + threadIdx_x < 1536 + +T.Mul(3, 49) + +T.Mul(3, 49) + threadIdx_x + +(T.Mul(3, 49) + threadIdx_x) // 192 + +(T.Mul(3, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(3, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(3, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + +T.Mul(3, 49) + +T.Mul(3, 49) + threadIdx_x + +(T.Mul(3, 49) + threadIdx_x) % 192 + +(T.Mul(3, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(3, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(3, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(3, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(3, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (T.Mul(3, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(3, 49) 
+ threadIdx_x) % 192 * 3 + rx_outer_outer] + +T.Mul(3, 49) + +T.Mul(3, 49) + threadIdx_x + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[T.Mul(3, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(3, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(3, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +threadIdx_x = T.int32() +if T.Mul(3, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(3, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(3, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(3, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(3, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(3, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(3, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(3, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +4 + +T.Mul(4, 49) + +T.Mul(4, 49) + threadIdx_x + +T.Mul(4, 49) + threadIdx_x < 1536 + +T.Mul(4, 49) + +T.Mul(4, 49) + threadIdx_x + +(T.Mul(4, 49) + threadIdx_x) // 192 + +(T.Mul(4, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(4, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(4, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + +T.Mul(4, 49) + +T.Mul(4, 49) + threadIdx_x + +(T.Mul(4, 49) + threadIdx_x) % 192 + +(T.Mul(4, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(4, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(4, 49) + threadIdx_x) % 
192 * 3 + +blockIdx_x * 36864 + (T.Mul(4, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(4, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (T.Mul(4, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(4, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +T.Mul(4, 49) + +T.Mul(4, 49) + threadIdx_x + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[T.Mul(4, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(4, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(4, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +threadIdx_x = T.int32() +if T.Mul(4, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(4, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(4, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(4, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(4, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(4, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(4, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(4, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +5 + +T.Mul(5, 49) + +T.Mul(5, 49) + threadIdx_x + +T.Mul(5, 49) + threadIdx_x < 1536 + +T.Mul(5, 49) + +T.Mul(5, 49) + threadIdx_x + +(T.Mul(5, 49) + threadIdx_x) // 192 + +(T.Mul(5, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(5, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(5, 49) + threadIdx_x) // 192 * 4608 + 
rc_outer_outer * 576 + +T.Mul(5, 49) + +T.Mul(5, 49) + threadIdx_x + +(T.Mul(5, 49) + threadIdx_x) % 192 + +(T.Mul(5, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(5, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(5, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(5, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(5, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (T.Mul(5, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(5, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +T.Mul(5, 49) + +T.Mul(5, 49) + threadIdx_x + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[T.Mul(5, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(5, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(5, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +threadIdx_x = T.int32() +if T.Mul(5, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(5, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(5, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(5, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(5, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(5, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(5, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(5, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +6 + +T.Mul(6, 49) + +T.Mul(6, 49) + threadIdx_x + +T.Mul(6, 49) + threadIdx_x < 1536 
+ +T.Mul(6, 49) + +T.Mul(6, 49) + threadIdx_x + +(T.Mul(6, 49) + threadIdx_x) // 192 + +(T.Mul(6, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(6, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(6, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + +T.Mul(6, 49) + +T.Mul(6, 49) + threadIdx_x + +(T.Mul(6, 49) + threadIdx_x) % 192 + +(T.Mul(6, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(6, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(6, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(6, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(6, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (T.Mul(6, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(6, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +T.Mul(6, 49) + +T.Mul(6, 49) + threadIdx_x + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[T.Mul(6, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(6, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(6, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +threadIdx_x = T.int32() +if T.Mul(6, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(6, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(6, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(6, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(6, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + 
kernel_shared[T.Mul(6, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(6, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(6, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +7 + +T.Mul(7, 49) + +T.Mul(7, 49) + threadIdx_x + +T.Mul(7, 49) + threadIdx_x < 1536 + +T.Mul(7, 49) + +T.Mul(7, 49) + threadIdx_x + +(T.Mul(7, 49) + threadIdx_x) // 192 + +(T.Mul(7, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(7, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(7, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + +T.Mul(7, 49) + +T.Mul(7, 49) + threadIdx_x + +(T.Mul(7, 49) + threadIdx_x) % 192 + +(T.Mul(7, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(7, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(7, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(7, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(7, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (T.Mul(7, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(7, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +T.Mul(7, 49) + +T.Mul(7, 49) + threadIdx_x + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[T.Mul(7, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(7, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(7, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +threadIdx_x = T.int32() +if T.Mul(7, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(7, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(7, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(7, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] 
+ +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(7, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(7, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(7, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(7, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +8 + +T.Mul(8, 49) + +T.Mul(8, 49) + threadIdx_x + +T.Mul(8, 49) + threadIdx_x < 1536 + +T.Mul(8, 49) + +T.Mul(8, 49) + threadIdx_x + +(T.Mul(8, 49) + threadIdx_x) // 192 + +(T.Mul(8, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(8, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(8, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + +T.Mul(8, 49) + +T.Mul(8, 49) + threadIdx_x + +(T.Mul(8, 49) + threadIdx_x) % 192 + +(T.Mul(8, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(8, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(8, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(8, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(8, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (T.Mul(8, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(8, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +T.Mul(8, 49) + +T.Mul(8, 49) + threadIdx_x + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[T.Mul(8, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(8, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(8, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +threadIdx_x = T.int32() +if T.Mul(8, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = 
T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(8, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(8, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(8, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(8, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(8, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(8, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(8, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +9 + +T.Mul(9, 49) + +T.Mul(9, 49) + threadIdx_x + +T.Mul(9, 49) + threadIdx_x < 1536 + +T.Mul(9, 49) + +T.Mul(9, 49) + threadIdx_x + +(T.Mul(9, 49) + threadIdx_x) // 192 + +(T.Mul(9, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(9, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(9, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + +T.Mul(9, 49) + +T.Mul(9, 49) + threadIdx_x + +(T.Mul(9, 49) + threadIdx_x) % 192 + +(T.Mul(9, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(9, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(9, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(9, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(9, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (T.Mul(9, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(9, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +T.Mul(9, 49) + +T.Mul(9, 49) + threadIdx_x + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[T.Mul(9, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(9, 49) + 
threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(9, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +threadIdx_x = T.int32() +if T.Mul(9, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(9, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(9, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(9, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(9, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(9, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(9, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(9, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +10 + +T.Mul(10, 49) + +T.Mul(10, 49) + threadIdx_x + +T.Mul(10, 49) + threadIdx_x < 1536 + +T.Mul(10, 49) + +T.Mul(10, 49) + threadIdx_x + +(T.Mul(10, 49) + threadIdx_x) // 192 + +(T.Mul(10, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(10, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(10, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + +T.Mul(10, 49) + +T.Mul(10, 49) + threadIdx_x + +(T.Mul(10, 49) + threadIdx_x) % 192 + +(T.Mul(10, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(10, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(10, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(10, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(10, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (T.Mul(10, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(10, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +T.Mul(10, 49) + +T.Mul(10, 49) + 
threadIdx_x + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[T.Mul(10, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(10, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(10, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +threadIdx_x = T.int32() +if T.Mul(10, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(10, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(10, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(10, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(10, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(10, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(10, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(10, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +11 + +T.Mul(11, 49) + +T.Mul(11, 49) + threadIdx_x + +T.Mul(11, 49) + threadIdx_x < 1536 + +T.Mul(11, 49) + +T.Mul(11, 49) + threadIdx_x + +(T.Mul(11, 49) + threadIdx_x) // 192 + +(T.Mul(11, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(11, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(11, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + +T.Mul(11, 49) + +T.Mul(11, 49) + threadIdx_x + +(T.Mul(11, 49) + threadIdx_x) % 192 + +(T.Mul(11, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(11, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(11, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(11, 49) + 
threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(11, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (T.Mul(11, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(11, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +T.Mul(11, 49) + +T.Mul(11, 49) + threadIdx_x + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[T.Mul(11, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(11, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(11, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +threadIdx_x = T.int32() +if T.Mul(11, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(11, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(11, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(11, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(11, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(11, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(11, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(11, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +12 + +T.Mul(12, 49) + +T.Mul(12, 49) + threadIdx_x + +T.Mul(12, 49) + threadIdx_x < 1536 + +T.Mul(12, 49) + +T.Mul(12, 49) + threadIdx_x + +(T.Mul(12, 49) + threadIdx_x) // 192 + +(T.Mul(12, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(12, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(12, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + 
+T.Mul(12, 49) + +T.Mul(12, 49) + threadIdx_x + +(T.Mul(12, 49) + threadIdx_x) % 192 + +(T.Mul(12, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(12, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(12, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(12, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(12, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (T.Mul(12, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(12, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +T.Mul(12, 49) + +T.Mul(12, 49) + threadIdx_x + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[T.Mul(12, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(12, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(12, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +threadIdx_x = T.int32() +if T.Mul(12, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(12, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(12, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(12, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(12, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(12, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(12, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(12, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +13 + +T.Mul(13, 49) + +T.Mul(13, 49) + threadIdx_x + +T.Mul(13, 49) + threadIdx_x < 
1536 + +T.Mul(13, 49) + +T.Mul(13, 49) + threadIdx_x + +(T.Mul(13, 49) + threadIdx_x) // 192 + +(T.Mul(13, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(13, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(13, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + +T.Mul(13, 49) + +T.Mul(13, 49) + threadIdx_x + +(T.Mul(13, 49) + threadIdx_x) % 192 + +(T.Mul(13, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(13, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(13, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(13, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(13, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (T.Mul(13, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(13, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +T.Mul(13, 49) + +T.Mul(13, 49) + threadIdx_x + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[T.Mul(13, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(13, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(13, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +threadIdx_x = T.int32() +if T.Mul(13, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(13, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(13, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(13, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(13, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer 
= T.int32() + kernel_shared[T.Mul(13, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(13, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(13, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +14 + +T.Mul(14, 49) + +T.Mul(14, 49) + threadIdx_x + +T.Mul(14, 49) + threadIdx_x < 1536 + +T.Mul(14, 49) + +T.Mul(14, 49) + threadIdx_x + +(T.Mul(14, 49) + threadIdx_x) // 192 + +(T.Mul(14, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(14, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(14, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + +T.Mul(14, 49) + +T.Mul(14, 49) + threadIdx_x + +(T.Mul(14, 49) + threadIdx_x) % 192 + +(T.Mul(14, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(14, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(14, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(14, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(14, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (T.Mul(14, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(14, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +T.Mul(14, 49) + +T.Mul(14, 49) + threadIdx_x + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[T.Mul(14, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(14, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(14, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +threadIdx_x = T.int32() +if T.Mul(14, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(14, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(14, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(14, 
49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(14, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(14, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(14, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(14, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +15 + +T.Mul(15, 49) + +T.Mul(15, 49) + threadIdx_x + +T.Mul(15, 49) + threadIdx_x < 1536 + +T.Mul(15, 49) + +T.Mul(15, 49) + threadIdx_x + +(T.Mul(15, 49) + threadIdx_x) // 192 + +(T.Mul(15, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(15, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(15, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + +T.Mul(15, 49) + +T.Mul(15, 49) + threadIdx_x + +(T.Mul(15, 49) + threadIdx_x) % 192 + +(T.Mul(15, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(15, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(15, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(15, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(15, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (T.Mul(15, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(15, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +T.Mul(15, 49) + +T.Mul(15, 49) + threadIdx_x + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[T.Mul(15, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(15, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(15, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +threadIdx_x = T.int32() +if T.Mul(15, 49) + threadIdx_x < 1536: + kernel_shared = 
T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(15, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(15, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(15, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(15, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(15, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(15, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(15, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +16 + +T.Mul(16, 49) + +T.Mul(16, 49) + threadIdx_x + +T.Mul(16, 49) + threadIdx_x < 1536 + +T.Mul(16, 49) + +T.Mul(16, 49) + threadIdx_x + +(T.Mul(16, 49) + threadIdx_x) // 192 + +(T.Mul(16, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(16, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(16, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + +T.Mul(16, 49) + +T.Mul(16, 49) + threadIdx_x + +(T.Mul(16, 49) + threadIdx_x) % 192 + +(T.Mul(16, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(16, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(16, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(16, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(16, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (T.Mul(16, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(16, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +T.Mul(16, 49) + +T.Mul(16, 49) + threadIdx_x + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() 
+rx_outer_outer = T.int32() +kernel_shared[T.Mul(16, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(16, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(16, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +threadIdx_x = T.int32() +if T.Mul(16, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(16, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(16, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(16, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(16, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(16, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(16, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(16, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +17 + +T.Mul(17, 49) + +T.Mul(17, 49) + threadIdx_x + +T.Mul(17, 49) + threadIdx_x < 1536 + +T.Mul(17, 49) + +T.Mul(17, 49) + threadIdx_x + +(T.Mul(17, 49) + threadIdx_x) // 192 + +(T.Mul(17, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(17, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(17, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + +T.Mul(17, 49) + +T.Mul(17, 49) + threadIdx_x + +(T.Mul(17, 49) + threadIdx_x) % 192 + +(T.Mul(17, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(17, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(17, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(17, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(17, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (T.Mul(17, 49) + threadIdx_x) // 
192 * 4608 + rc_outer_outer * 576 + (T.Mul(17, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +T.Mul(17, 49) + +T.Mul(17, 49) + threadIdx_x + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[T.Mul(17, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(17, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(17, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +threadIdx_x = T.int32() +if T.Mul(17, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(17, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(17, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(17, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(17, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(17, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(17, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(17, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +18 + +T.Mul(18, 49) + +T.Mul(18, 49) + threadIdx_x + +T.Mul(18, 49) + threadIdx_x < 1536 + +T.Mul(18, 49) + +T.Mul(18, 49) + threadIdx_x + +(T.Mul(18, 49) + threadIdx_x) // 192 + +(T.Mul(18, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(18, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(18, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + +T.Mul(18, 49) + +T.Mul(18, 49) + threadIdx_x + +(T.Mul(18, 49) + threadIdx_x) % 192 + +(T.Mul(18, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(18, 49) + 
threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(18, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(18, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(18, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (T.Mul(18, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(18, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +T.Mul(18, 49) + +T.Mul(18, 49) + threadIdx_x + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[T.Mul(18, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(18, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(18, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +threadIdx_x = T.int32() +if T.Mul(18, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(18, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(18, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(18, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(18, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(18, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(18, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(18, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +19 + +T.Mul(19, 49) + +T.Mul(19, 49) + threadIdx_x + +T.Mul(19, 49) + threadIdx_x < 1536 + +T.Mul(19, 49) + +T.Mul(19, 49) + threadIdx_x + +(T.Mul(19, 49) + threadIdx_x) // 192 + +(T.Mul(19, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + 
(T.Mul(19, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(19, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + +T.Mul(19, 49) + +T.Mul(19, 49) + threadIdx_x + +(T.Mul(19, 49) + threadIdx_x) % 192 + +(T.Mul(19, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(19, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(19, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(19, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(19, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (T.Mul(19, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(19, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +T.Mul(19, 49) + +T.Mul(19, 49) + threadIdx_x + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[T.Mul(19, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(19, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(19, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +threadIdx_x = T.int32() +if T.Mul(19, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(19, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(19, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(19, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(19, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(19, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(19, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + 
(T.Mul(19, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +20 + +T.Mul(20, 49) + +T.Mul(20, 49) + threadIdx_x + +T.Mul(20, 49) + threadIdx_x < 1536 + +T.Mul(20, 49) + +T.Mul(20, 49) + threadIdx_x + +(T.Mul(20, 49) + threadIdx_x) // 192 + +(T.Mul(20, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(20, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(20, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + +T.Mul(20, 49) + +T.Mul(20, 49) + threadIdx_x + +(T.Mul(20, 49) + threadIdx_x) % 192 + +(T.Mul(20, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(20, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(20, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(20, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(20, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (T.Mul(20, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(20, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +T.Mul(20, 49) + +T.Mul(20, 49) + threadIdx_x + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[T.Mul(20, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(20, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(20, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +threadIdx_x = T.int32() +if T.Mul(20, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(20, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(20, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(20, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(20, 49) + threadIdx_x < 1536: + 
kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(20, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(20, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(20, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +21 + +T.Mul(21, 49) + +T.Mul(21, 49) + threadIdx_x + +T.Mul(21, 49) + threadIdx_x < 1536 + +T.Mul(21, 49) + +T.Mul(21, 49) + threadIdx_x + +(T.Mul(21, 49) + threadIdx_x) // 192 + +(T.Mul(21, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(21, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(21, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + +T.Mul(21, 49) + +T.Mul(21, 49) + threadIdx_x + +(T.Mul(21, 49) + threadIdx_x) % 192 + +(T.Mul(21, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(21, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(21, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(21, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(21, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (T.Mul(21, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(21, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +T.Mul(21, 49) + +T.Mul(21, 49) + threadIdx_x + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[T.Mul(21, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(21, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(21, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +threadIdx_x = T.int32() +if T.Mul(21, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + 
kernel_shared[T.Mul(21, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(21, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(21, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(21, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(21, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(21, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(21, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +22 + +T.Mul(22, 49) + +T.Mul(22, 49) + threadIdx_x + +T.Mul(22, 49) + threadIdx_x < 1536 + +T.Mul(22, 49) + +T.Mul(22, 49) + threadIdx_x + +(T.Mul(22, 49) + threadIdx_x) // 192 + +(T.Mul(22, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(22, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(22, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + +T.Mul(22, 49) + +T.Mul(22, 49) + threadIdx_x + +(T.Mul(22, 49) + threadIdx_x) % 192 + +(T.Mul(22, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(22, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(22, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(22, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(22, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (T.Mul(22, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(22, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +T.Mul(22, 49) + +T.Mul(22, 49) + threadIdx_x + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[T.Mul(22, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(22, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 
576 + (T.Mul(22, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +threadIdx_x = T.int32() +if T.Mul(22, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(22, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(22, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(22, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(22, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(22, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(22, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(22, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +23 + +T.Mul(23, 49) + +T.Mul(23, 49) + threadIdx_x + +T.Mul(23, 49) + threadIdx_x < 1536 + +T.Mul(23, 49) + +T.Mul(23, 49) + threadIdx_x + +(T.Mul(23, 49) + threadIdx_x) // 192 + +(T.Mul(23, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(23, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(23, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + +T.Mul(23, 49) + +T.Mul(23, 49) + threadIdx_x + +(T.Mul(23, 49) + threadIdx_x) % 192 + +(T.Mul(23, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(23, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(23, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(23, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(23, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (T.Mul(23, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(23, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +T.Mul(23, 49) + +T.Mul(23, 49) + threadIdx_x + +kernel_shared = 
T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[T.Mul(23, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(23, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(23, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +threadIdx_x = T.int32() +if T.Mul(23, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(23, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(23, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(23, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(23, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(23, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(23, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(23, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +24 + +T.Mul(24, 49) + +T.Mul(24, 49) + threadIdx_x + +T.Mul(24, 49) + threadIdx_x < 1536 + +T.Mul(24, 49) + +T.Mul(24, 49) + threadIdx_x + +(T.Mul(24, 49) + threadIdx_x) // 192 + +(T.Mul(24, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(24, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(24, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + +T.Mul(24, 49) + +T.Mul(24, 49) + threadIdx_x + +(T.Mul(24, 49) + threadIdx_x) % 192 + +(T.Mul(24, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(24, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(24, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(24, 49) + threadIdx_x) // 192 * 4608 + 
rc_outer_outer * 576 + (T.Mul(24, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (T.Mul(24, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(24, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +T.Mul(24, 49) + +T.Mul(24, 49) + threadIdx_x + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[T.Mul(24, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(24, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(24, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +threadIdx_x = T.int32() +if T.Mul(24, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(24, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(24, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(24, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(24, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(24, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(24, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(24, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +25 + +T.Mul(25, 49) + +T.Mul(25, 49) + threadIdx_x + +T.Mul(25, 49) + threadIdx_x < 1536 + +T.Mul(25, 49) + +T.Mul(25, 49) + threadIdx_x + +(T.Mul(25, 49) + threadIdx_x) // 192 + +(T.Mul(25, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(25, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(25, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + +T.Mul(25, 49) + +T.Mul(25, 
49) + threadIdx_x + +(T.Mul(25, 49) + threadIdx_x) % 192 + +(T.Mul(25, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(25, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(25, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(25, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(25, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (T.Mul(25, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(25, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +T.Mul(25, 49) + +T.Mul(25, 49) + threadIdx_x + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[T.Mul(25, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(25, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(25, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +threadIdx_x = T.int32() +if T.Mul(25, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(25, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(25, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(25, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(25, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(25, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(25, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(25, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +26 + +T.Mul(26, 49) + +T.Mul(26, 49) + threadIdx_x + +T.Mul(26, 49) + threadIdx_x < 1536 + +T.Mul(26, 49) + 
+T.Mul(26, 49) + threadIdx_x + +(T.Mul(26, 49) + threadIdx_x) // 192 + +(T.Mul(26, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(26, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(26, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + +T.Mul(26, 49) + +T.Mul(26, 49) + threadIdx_x + +(T.Mul(26, 49) + threadIdx_x) % 192 + +(T.Mul(26, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(26, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(26, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(26, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(26, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (T.Mul(26, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(26, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +T.Mul(26, 49) + +T.Mul(26, 49) + threadIdx_x + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[T.Mul(26, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(26, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(26, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +threadIdx_x = T.int32() +if T.Mul(26, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(26, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(26, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(26, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(26, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + 
kernel_shared[T.Mul(26, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(26, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(26, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +27 + +T.Mul(27, 49) + +T.Mul(27, 49) + threadIdx_x + +T.Mul(27, 49) + threadIdx_x < 1536 + +T.Mul(27, 49) + +T.Mul(27, 49) + threadIdx_x + +(T.Mul(27, 49) + threadIdx_x) // 192 + +(T.Mul(27, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(27, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(27, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + +T.Mul(27, 49) + +T.Mul(27, 49) + threadIdx_x + +(T.Mul(27, 49) + threadIdx_x) % 192 + +(T.Mul(27, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(27, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(27, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(27, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(27, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (T.Mul(27, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(27, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +T.Mul(27, 49) + +T.Mul(27, 49) + threadIdx_x + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[T.Mul(27, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(27, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(27, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +threadIdx_x = T.int32() +if T.Mul(27, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(27, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(27, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(27, 49) + 
threadIdx_x) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(27, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(27, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(27, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(27, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +28 + +T.Mul(28, 49) + +T.Mul(28, 49) + threadIdx_x + +T.Mul(28, 49) + threadIdx_x < 1536 + +T.Mul(28, 49) + +T.Mul(28, 49) + threadIdx_x + +(T.Mul(28, 49) + threadIdx_x) // 192 + +(T.Mul(28, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(28, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(28, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + +T.Mul(28, 49) + +T.Mul(28, 49) + threadIdx_x + +(T.Mul(28, 49) + threadIdx_x) % 192 + +(T.Mul(28, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(28, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(28, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(28, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(28, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (T.Mul(28, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(28, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +T.Mul(28, 49) + +T.Mul(28, 49) + threadIdx_x + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[T.Mul(28, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(28, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(28, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +threadIdx_x = T.int32() +if T.Mul(28, 49) + threadIdx_x < 1536: + kernel_shared = 
T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(28, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(28, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(28, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(28, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(28, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(28, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(28, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +29 + +T.Mul(29, 49) + +T.Mul(29, 49) + threadIdx_x + +T.Mul(29, 49) + threadIdx_x < 1536 + +T.Mul(29, 49) + +T.Mul(29, 49) + threadIdx_x + +(T.Mul(29, 49) + threadIdx_x) // 192 + +(T.Mul(29, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(29, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(29, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + +T.Mul(29, 49) + +T.Mul(29, 49) + threadIdx_x + +(T.Mul(29, 49) + threadIdx_x) % 192 + +(T.Mul(29, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(29, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(29, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(29, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(29, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (T.Mul(29, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(29, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +T.Mul(29, 49) + +T.Mul(29, 49) + threadIdx_x + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() 
+rx_outer_outer = T.int32() +kernel_shared[T.Mul(29, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(29, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(29, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +threadIdx_x = T.int32() +if T.Mul(29, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(29, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(29, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(29, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(29, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(29, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(29, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(29, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +30 + +T.Mul(30, 49) + +T.Mul(30, 49) + threadIdx_x + +T.Mul(30, 49) + threadIdx_x < 1536 + +T.Mul(30, 49) + +T.Mul(30, 49) + threadIdx_x + +(T.Mul(30, 49) + threadIdx_x) // 192 + +(T.Mul(30, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(30, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(30, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + +T.Mul(30, 49) + +T.Mul(30, 49) + threadIdx_x + +(T.Mul(30, 49) + threadIdx_x) % 192 + +(T.Mul(30, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(30, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(30, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(30, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(30, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (T.Mul(30, 49) + threadIdx_x) // 
192 * 4608 + rc_outer_outer * 576 + (T.Mul(30, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +T.Mul(30, 49) + +T.Mul(30, 49) + threadIdx_x + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[T.Mul(30, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(30, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(30, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +threadIdx_x = T.int32() +if T.Mul(30, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(30, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(30, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(30, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(30, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(30, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(30, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(30, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +31 + +T.Mul(31, 49) + +T.Mul(31, 49) + threadIdx_x + +T.Mul(31, 49) + threadIdx_x < 1536 + +T.Mul(31, 49) + +T.Mul(31, 49) + threadIdx_x + +(T.Mul(31, 49) + threadIdx_x) // 192 + +(T.Mul(31, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(31, 49) + threadIdx_x) // 192 * 4608 + +blockIdx_x * 36864 + (T.Mul(31, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + +T.Mul(31, 49) + +T.Mul(31, 49) + threadIdx_x + +(T.Mul(31, 49) + threadIdx_x) % 192 + +(T.Mul(31, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(31, 49) + 
threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(31, 49) + threadIdx_x) % 192 * 3 + +blockIdx_x * 36864 + (T.Mul(31, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(31, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (T.Mul(31, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(31, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +T.Mul(31, 49) + +T.Mul(31, 49) + threadIdx_x + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[T.Mul(31, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(31, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(31, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +threadIdx_x = T.int32() +if T.Mul(31, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(31, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(31, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(31, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if T.Mul(31, 49) + threadIdx_x < 1536: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[T.Mul(31, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(31, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(31, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] + +0 + +8 + +0 + +4 + +T.Mul(0, 4) + +0 + +T.Add(T.Mul(0, 4), 0) + +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + +rc_outer_inner + +504 + +rc_outer_inner * 504 + +0 + +63 + +T.Mul(0, 63) + +rc_outer_inner * 504 + T.Mul(0, 63) + +0 + +7 + +T.Mul(0, 7) + 
+rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + +threadIdx_x + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] + +768 + +T.Mul(0, 768) + +192 + +T.Mul(0, 192) + +T.Mul(0, 768) + T.Mul(0, 192) + +24 + +rc_outer_inner * 24 + +T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + +3 + +T.Mul(0, 3) + +T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + +T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0) + +kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + +4 + +T.Mul(0, 4) + +T.Add(T.Mul(0, 4), 0) + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + +T.Mul(0, 4) + +1 + +T.Mul(0, 4) + 1 + +conv2d_nchw[T.Mul(0, 4) + 1] + +T.Mul(0, 63) + +rc_outer_inner * 504 + T.Mul(0, 63) + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(0, 768) + +T.Mul(1, 192) + +T.Mul(0, 768) + T.Mul(1, 192) + +T.Mul(0, 768) + T.Mul(1, 192) + 
rc_outer_inner * 24 + +T.Mul(0, 3) + +T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + +T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0) + +kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + +conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + +T.Mul(0, 4) + +2 + +T.Mul(0, 4) + 2 + +conv2d_nchw[T.Mul(0, 4) + 2] + +T.Mul(0, 63) + +rc_outer_inner * 504 + T.Mul(0, 63) + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(0, 768) + +T.Mul(2, 192) + +T.Mul(0, 768) + T.Mul(2, 192) + +T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + +T.Mul(0, 3) + +T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + +T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0) + +kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + 
T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + +conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + +T.Mul(0, 4) + +3 + +T.Mul(0, 4) + 3 + +conv2d_nchw[T.Mul(0, 4) + 3] + +T.Mul(0, 63) + +rc_outer_inner * 504 + T.Mul(0, 63) + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(0, 768) + +T.Mul(3, 192) + +T.Mul(0, 768) + T.Mul(3, 192) + +T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + +T.Mul(0, 3) + +T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + +T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0) + +kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + +conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), 
scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + +T.Mul(0, 4) + +T.Add(T.Mul(0, 4), 0) + +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + +1 + +T.Mul(1, 63) + +rc_outer_inner * 504 + T.Mul(1, 63) + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(0, 768) + +T.Mul(0, 768) + T.Mul(0, 192) + +T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + +T.Mul(1, 3) + +T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + +T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0) + +kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + +T.Mul(0, 4) + +T.Add(T.Mul(0, 4), 0) + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) 
+ 1 + +conv2d_nchw[T.Mul(0, 4) + 1] + +T.Mul(1, 63) + +rc_outer_inner * 504 + T.Mul(1, 63) + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(0, 768) + +T.Mul(0, 768) + T.Mul(1, 192) + +T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + +T.Mul(1, 3) + +T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + +T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0) + +kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + +conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 2 + +conv2d_nchw[T.Mul(0, 4) + 2] + +T.Mul(1, 63) + +rc_outer_inner * 504 + T.Mul(1, 63) + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(0, 768) + +T.Mul(0, 768) + T.Mul(2, 192) + +T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + +T.Mul(1, 
3) + +T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + +T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0) + +kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + +conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 3 + +conv2d_nchw[T.Mul(0, 4) + 3] + +T.Mul(1, 63) + +rc_outer_inner * 504 + T.Mul(1, 63) + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(0, 768) + +T.Mul(0, 768) + T.Mul(3, 192) + +T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + +T.Mul(1, 3) + +T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + +T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0) + +kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + 
+conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + +T.Mul(0, 4) + +T.Add(T.Mul(0, 4), 0) + +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + +2 + +T.Mul(2, 63) + +rc_outer_inner * 504 + T.Mul(2, 63) + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(0, 768) + +T.Mul(0, 768) + T.Mul(0, 192) + +T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + +T.Mul(2, 3) + +T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + +T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0) + +kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + +T.Mul(0, 4) + +T.Add(T.Mul(0, 4), 0) + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = 
T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 1 + +conv2d_nchw[T.Mul(0, 4) + 1] + +T.Mul(2, 63) + +rc_outer_inner * 504 + T.Mul(2, 63) + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(0, 768) + +T.Mul(0, 768) + T.Mul(1, 192) + +T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + +T.Mul(2, 3) + +T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + +T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0) + +kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + +conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 2 + +conv2d_nchw[T.Mul(0, 4) + 2] + +T.Mul(2, 63) + +rc_outer_inner * 504 + T.Mul(2, 
63) + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(0, 768) + +T.Mul(0, 768) + T.Mul(2, 192) + +T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + +T.Mul(2, 3) + +T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + +T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0) + +kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + +conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 3 + +conv2d_nchw[T.Mul(0, 4) + 3] + +T.Mul(2, 63) + +rc_outer_inner * 504 + T.Mul(2, 63) + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(0, 768) + +T.Mul(0, 768) + T.Mul(3, 192) + +T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + +T.Mul(2, 3) + +T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 
+T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0) + +kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + +conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + +T.Mul(0, 4) + +T.Add(T.Mul(0, 4), 0) + +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + +3 + +T.Mul(3, 63) + +rc_outer_inner * 504 + T.Mul(3, 63) + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(0, 768) + +T.Mul(0, 768) + T.Mul(0, 192) + +T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + +T.Mul(3, 3) + +T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + +T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0) + +kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + +T.Mul(0, 4) + +T.Add(T.Mul(0, 4), 0) + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 1 + +conv2d_nchw[T.Mul(0, 4) + 1] + +T.Mul(3, 63) + +rc_outer_inner * 504 + T.Mul(3, 63) + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(0, 768) + +T.Mul(0, 768) + T.Mul(1, 192) + +T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + +T.Mul(3, 3) + +T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + +T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0) + +kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + +conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), 
scope="shared") +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 2 + +conv2d_nchw[T.Mul(0, 4) + 2] + +T.Mul(3, 63) + +rc_outer_inner * 504 + T.Mul(3, 63) + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(0, 768) + +T.Mul(0, 768) + T.Mul(2, 192) + +T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + +T.Mul(3, 3) + +T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + +T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0) + +kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + +conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 3 + +conv2d_nchw[T.Mul(0, 4) + 3] + +T.Mul(3, 63) + +rc_outer_inner * 504 + T.Mul(3, 63) + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) 
+ T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(0, 768) + +T.Mul(0, 768) + T.Mul(3, 192) + +T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + +T.Mul(3, 3) + +T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + +T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0) + +kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + +conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + +T.Mul(0, 4) + +T.Add(T.Mul(0, 4), 0) + +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + +4 + +T.Mul(4, 63) + +rc_outer_inner * 504 + T.Mul(4, 63) + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(0, 768) + +T.Mul(0, 768) + T.Mul(0, 192) + +T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + +T.Mul(4, 3) + +T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + +T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner 
* 24 + T.Mul(4, 3), 0) + +kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + +T.Mul(0, 4) + +T.Add(T.Mul(0, 4), 0) + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 1 + +conv2d_nchw[T.Mul(0, 4) + 1] + +T.Mul(4, 63) + +rc_outer_inner * 504 + T.Mul(4, 63) + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(0, 768) + +T.Mul(0, 768) + T.Mul(1, 192) + +T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + +T.Mul(4, 3) + +T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + +T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0) + +kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + +conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + 
threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 2 + +conv2d_nchw[T.Mul(0, 4) + 2] + +T.Mul(4, 63) + +rc_outer_inner * 504 + T.Mul(4, 63) + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(0, 768) + +T.Mul(0, 768) + T.Mul(2, 192) + +T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + +T.Mul(4, 3) + +T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + +T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0) + +kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + +conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 3 + +conv2d_nchw[T.Mul(0, 4) + 3] + +T.Mul(4, 63) + +rc_outer_inner * 504 + T.Mul(4, 63) + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(0, 768) + +T.Mul(0, 768) + T.Mul(3, 192) + +T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + +T.Mul(4, 3) + +T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + +T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0) + +kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + +conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + +T.Mul(0, 4) + +T.Add(T.Mul(0, 4), 0) + +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + +5 + +T.Mul(5, 63) + +rc_outer_inner * 504 + T.Mul(5, 63) + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + 
T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(0, 768) + +T.Mul(0, 768) + T.Mul(0, 192) + +T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + +T.Mul(5, 3) + +T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + +T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0) + +kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + +T.Mul(0, 4) + +T.Add(T.Mul(0, 4), 0) + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 1 + +conv2d_nchw[T.Mul(0, 4) + 1] + +T.Mul(5, 63) + +rc_outer_inner * 504 + T.Mul(5, 63) + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(0, 768) + +T.Mul(0, 768) + T.Mul(1, 192) + +T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + +T.Mul(5, 3) + +T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + +T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0) + 
+kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + +conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 2 + +conv2d_nchw[T.Mul(0, 4) + 2] + +T.Mul(5, 63) + +rc_outer_inner * 504 + T.Mul(5, 63) + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(0, 768) + +T.Mul(0, 768) + T.Mul(2, 192) + +T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + +T.Mul(5, 3) + +T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + +T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0) + +kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + +conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + 
T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 3 + +conv2d_nchw[T.Mul(0, 4) + 3] + +T.Mul(5, 63) + +rc_outer_inner * 504 + T.Mul(5, 63) + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(0, 768) + +T.Mul(0, 768) + T.Mul(3, 192) + +T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + +T.Mul(5, 3) + +T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + +T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0) + +kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + +conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + 
T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + +T.Mul(0, 4) + +T.Add(T.Mul(0, 4), 0) + +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + +6 + +T.Mul(6, 63) + +rc_outer_inner * 504 + T.Mul(6, 63) + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(0, 768) + +T.Mul(0, 768) + T.Mul(0, 192) + +T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + +T.Mul(6, 3) + +T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + +T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0) + +kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + +T.Mul(0, 4) + +T.Add(T.Mul(0, 4), 0) + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 1 + +conv2d_nchw[T.Mul(0, 4) + 1] + +T.Mul(6, 63) + +rc_outer_inner * 504 + T.Mul(6, 63) + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x + 
+pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(0, 768) + +T.Mul(0, 768) + T.Mul(1, 192) + +T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + +T.Mul(6, 3) + +T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + +T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0) + +kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + +conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 2 + +conv2d_nchw[T.Mul(0, 4) + 2] + +T.Mul(6, 63) + +rc_outer_inner * 504 + T.Mul(6, 63) + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(0, 768) + +T.Mul(0, 768) + T.Mul(2, 192) + +T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + +T.Mul(6, 3) + +T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + +T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0) + +kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + 
T.Mul(6, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + +conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 3 + +conv2d_nchw[T.Mul(0, 4) + 3] + +T.Mul(6, 63) + +rc_outer_inner * 504 + T.Mul(6, 63) + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(0, 768) + +T.Mul(0, 768) + T.Mul(3, 192) + +T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + +T.Mul(6, 3) + +T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + +T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0) + +kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + +conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + +T.Mul(0, 4) + 
+T.Mul(0, 4) + 3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + +T.Mul(0, 4) + +T.Add(T.Mul(0, 4), 0) + +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + +7 + +T.Mul(7, 63) + +rc_outer_inner * 504 + T.Mul(7, 63) + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(0, 768) + +T.Mul(0, 768) + T.Mul(0, 192) + +T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + +T.Mul(7, 3) + +T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + +T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0) + +kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + +T.Mul(0, 4) + +T.Add(T.Mul(0, 4), 0) + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * 
kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 1 + +conv2d_nchw[T.Mul(0, 4) + 1] + +T.Mul(7, 63) + +rc_outer_inner * 504 + T.Mul(7, 63) + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(0, 768) + +T.Mul(0, 768) + T.Mul(1, 192) + +T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + +T.Mul(7, 3) + +T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + +T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0) + +kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + +conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 2 + +conv2d_nchw[T.Mul(0, 4) + 2] + +T.Mul(7, 63) + +rc_outer_inner * 504 + T.Mul(7, 63) + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + 
threadIdx_x] + +T.Mul(0, 768) + +T.Mul(0, 768) + T.Mul(2, 192) + +T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + +T.Mul(7, 3) + +T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + +T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0) + +kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + +conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 3 + +conv2d_nchw[T.Mul(0, 4) + 3] + +T.Mul(7, 63) + +rc_outer_inner * 504 + T.Mul(7, 63) + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(0, 768) + +T.Mul(0, 768) + T.Mul(3, 192) + +T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + +T.Mul(7, 3) + +T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + +T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0) + +kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 
63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + +conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + +T.Mul(0, 4) + +T.Mul(0, 4) + 3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + +1 + +T.Mul(1, 4) + +T.Add(T.Mul(1, 4), 0) + +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(1, 768) + +T.Mul(1, 768) + T.Mul(0, 192) + +T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + +T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + +T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0) + +kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + +T.Mul(1, 4) + +T.Add(T.Mul(1, 4), 0) + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), 
scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 1 + +conv2d_nchw[T.Mul(1, 4) + 1] + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(1, 768) + +T.Mul(1, 768) + T.Mul(1, 192) + +T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + +T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + +T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0) + +kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + +conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 2 + +conv2d_nchw[T.Mul(1, 4) + 2] + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + 
T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(1, 768) + +T.Mul(1, 768) + T.Mul(2, 192) + +T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + +T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + +T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0) + +kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + +conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 3 + +conv2d_nchw[T.Mul(1, 4) + 3] + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(1, 768) + +T.Mul(1, 768) + T.Mul(3, 192) + +T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + +T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + +T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0) + +kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + 
T.Mul(0, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + +conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + +T.Mul(1, 4) + +T.Add(T.Mul(1, 4), 0) + +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(1, 768) + +T.Mul(1, 768) + T.Mul(0, 192) + +T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + +T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + +T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0) + +kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + +T.Mul(1, 4) + +T.Add(T.Mul(1, 4), 0) + +conv2d_nchw = T.Buffer((8,), 
scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 1 + +conv2d_nchw[T.Mul(1, 4) + 1] + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(1, 768) + +T.Mul(1, 768) + T.Mul(1, 192) + +T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + +T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + +T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0) + +kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + +conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 2 + +conv2d_nchw[T.Mul(1, 4) + 2] 
+ +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(1, 768) + +T.Mul(1, 768) + T.Mul(2, 192) + +T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + +T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + +T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0) + +kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + +conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 3 + +conv2d_nchw[T.Mul(1, 4) + 3] + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(1, 768) + +T.Mul(1, 768) + T.Mul(3, 192) + +T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + +T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + +T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0) + 
+kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + +conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + +T.Mul(1, 4) + +T.Add(T.Mul(1, 4), 0) + +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(1, 768) + +T.Mul(1, 768) + T.Mul(0, 192) + +T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + +T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + +T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0) + +kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + 
+T.Mul(1, 4) + +T.Add(T.Mul(1, 4), 0) + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 1 + +conv2d_nchw[T.Mul(1, 4) + 1] + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(1, 768) + +T.Mul(1, 768) + T.Mul(1, 192) + +T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + +T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + +T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0) + +kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + +conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 
0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 2 + +conv2d_nchw[T.Mul(1, 4) + 2] + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(1, 768) + +T.Mul(1, 768) + T.Mul(2, 192) + +T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + +T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + +T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0) + +kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + +conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 3 + +conv2d_nchw[T.Mul(1, 4) + 3] + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(1, 768) + +T.Mul(1, 768) + T.Mul(3, 192) + +T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + +T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + +T.Add(T.Mul(1, 768) + T.Mul(3, 
192) + rc_outer_inner * 24 + T.Mul(2, 3), 0) + +kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + +conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + +T.Mul(1, 4) + +T.Add(T.Mul(1, 4), 0) + +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(1, 768) + +T.Mul(1, 768) + T.Mul(0, 192) + +T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + +T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + +T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0) + +kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 
192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + +T.Mul(1, 4) + +T.Add(T.Mul(1, 4), 0) + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 1 + +conv2d_nchw[T.Mul(1, 4) + 1] + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(1, 768) + +T.Mul(1, 768) + T.Mul(1, 192) + +T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + +T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + +T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0) + +kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + +conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + 
T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 2 + +conv2d_nchw[T.Mul(1, 4) + 2] + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(1, 768) + +T.Mul(1, 768) + T.Mul(2, 192) + +T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + +T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + +T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0) + +kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + +conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 3 + +conv2d_nchw[T.Mul(1, 4) + 3] + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(1, 768) + +T.Mul(1, 768) + T.Mul(3, 192) + +T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + +T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 
24 + T.Mul(3, 3) + +T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0) + +kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + +conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + +T.Mul(1, 4) + +T.Add(T.Mul(1, 4), 0) + +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(1, 768) + +T.Mul(1, 768) + T.Mul(0, 192) + +T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + +T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + +T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0) + +kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + 
threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + +T.Mul(1, 4) + +T.Add(T.Mul(1, 4), 0) + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 1 + +conv2d_nchw[T.Mul(1, 4) + 1] + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(1, 768) + +T.Mul(1, 768) + T.Mul(1, 192) + +T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + +T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + +T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0) + +kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + +conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 
7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 2 + +conv2d_nchw[T.Mul(1, 4) + 2] + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(1, 768) + +T.Mul(1, 768) + T.Mul(2, 192) + +T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + +T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + +T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0) + +kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + +conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 3 + +conv2d_nchw[T.Mul(1, 4) + 3] + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(1, 768) + +T.Mul(1, 768) + T.Mul(3, 192) + +T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 
24 + +T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + +T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0) + +kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + +conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + +T.Mul(1, 4) + +T.Add(T.Mul(1, 4), 0) + +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(1, 768) + +T.Mul(1, 768) + T.Mul(0, 192) + +T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + +T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + +T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0) + +kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + +T.Mul(1, 4) + +T.Add(T.Mul(1, 4), 0) + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 1 + +conv2d_nchw[T.Mul(1, 4) + 1] + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(1, 768) + +T.Mul(1, 768) + T.Mul(1, 192) + +T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + +T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + +T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0) + +kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + +conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 
1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 2 + +conv2d_nchw[T.Mul(1, 4) + 2] + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(1, 768) + +T.Mul(1, 768) + T.Mul(2, 192) + +T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + +T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + +T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0) + +kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + +conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 3 + +conv2d_nchw[T.Mul(1, 4) + 3] + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(1, 768) + +T.Mul(1, 768) 
+ T.Mul(3, 192) + +T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + +T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + +T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0) + +kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + +conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + +T.Mul(1, 4) + +T.Add(T.Mul(1, 4), 0) + +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(1, 768) + +T.Mul(1, 768) + T.Mul(0, 192) + +T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + +T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + +T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0) + +kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 
0)] + +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + +T.Mul(1, 4) + +T.Add(T.Mul(1, 4), 0) + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 1 + +conv2d_nchw[T.Mul(1, 4) + 1] + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(1, 768) + +T.Mul(1, 768) + T.Mul(1, 192) + +T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + +T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + +T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0) + +kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + +conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") 
+conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 2 + +conv2d_nchw[T.Mul(1, 4) + 2] + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(1, 768) + +T.Mul(1, 768) + T.Mul(2, 192) + +T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + +T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + +T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0) + +kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + +conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 3 + +conv2d_nchw[T.Mul(1, 4) + 3] + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + 
T.Mul(0, 7) + threadIdx_x] + +T.Mul(1, 768) + +T.Mul(1, 768) + T.Mul(3, 192) + +T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + +T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + +T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0) + +kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + +conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + +T.Mul(1, 4) + +T.Add(T.Mul(1, 4), 0) + +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(1, 768) + +T.Mul(1, 768) + T.Mul(0, 192) + +T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + +T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + +T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0) + +kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * 
kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + +T.Mul(1, 4) + +T.Add(T.Mul(1, 4), 0) + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 1 + +conv2d_nchw[T.Mul(1, 4) + 1] + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(1, 768) + +T.Mul(1, 768) + T.Mul(1, 192) + +T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + +T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + +T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0) + +kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + +conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() 
+threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 2 + +conv2d_nchw[T.Mul(1, 4) + 2] + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(1, 768) + +T.Mul(1, 768) + T.Mul(2, 192) + +T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + +T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + +T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0) + +kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + +conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 3 + +conv2d_nchw[T.Mul(1, 4) + 3] + +T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 
7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] + +T.Mul(1, 768) + +T.Mul(1, 768) + T.Mul(3, 192) + +T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + +T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + +T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0) + +kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + +conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + +T.Mul(1, 4) + +T.Mul(1, 4) + 3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + +1 + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1 + +kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * 
kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1 + +kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + +conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(2, 192) + 
rc_outer_inner * 24 + T.Mul(0, 3) + 1 + +kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + +conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1 + +kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + +conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1 + +kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1 + +kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + +conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner 
* 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1 + +kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + +conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] 
+ +T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1 + +kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + +conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1 + +kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = 
conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1 + +kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + +conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1 + +kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + +conv2d_nchw[T.Mul(0, 4) + 2] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1 + +kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + +conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 
63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1 + +kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1 + +kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + +conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") 
+conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1 + +kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + +conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1 + +kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + 
+conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1 + +kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x + 
+pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1 + +kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + +conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1 + +kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + +conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = 
T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1 + +kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + +conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1 + +kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + 
T.Mul(5, 3) + 1] + +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1 + +kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + +conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 
7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1 + +kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + +conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1 + +kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + +conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() 
+kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1 + +kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1 + +kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + 
T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + +conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1 + +kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + +conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + 
T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1 + +kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + +conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1 + +kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = 
T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1 + +kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + +conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1 + +kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * 
kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + +conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1 + +kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + +conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + 
+rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1 + +kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1 + +kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + +conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), 
scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1 + +kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + +conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1 + +kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + 
threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + +conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1 + +kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 
504 + T.Mul(1, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1 + +kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + +conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1 + +kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + +conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = 
T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1 + +kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + +conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1 + +kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 
63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1 + +kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + +conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + +T.Mul(1, 7) + 
+rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1 + +kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + +conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1 + +kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + +conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) 
+pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1 + +kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1 + +kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + 
+pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + +conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1 + +kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + +conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + 
T.Mul(3, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1 + +kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + +conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1 + +kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + +conv2d_nchw = 
T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1 + +kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + +conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1 + +kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 
+ T.Mul(4, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + +conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1 + +kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + +conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + 
rc_outer_inner * 24 + T.Mul(4, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1 + +kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1 + +kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + +conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 
3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1 + +kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + +conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1 + +kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + 
rc_outer_inner * 24 + T.Mul(5, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + +conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1 + +kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * 
kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1 + +kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + +conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1 + +kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + +conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + 
rc_outer_inner * 24 + T.Mul(6, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1 + +kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + +conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1 + +kernel_shared[T.Mul(1, 
768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1 + +kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + +conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + 
threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1 + +kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + +conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + +T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1 + +kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + +conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + 
T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + +2 + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2 + +kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + 
T.Mul(0, 3) + 2 + +kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + +conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2 + +kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + +conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 
63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2 + +kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + +conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2 + +kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * 
kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2 + +kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + +conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(2, 192) + 
rc_outer_inner * 24 + T.Mul(1, 3) + 2 + +kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + +conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2 + +kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + +conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2 + +kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2 + +kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + +conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner 
* 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2 + +kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + +conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] 
+ +T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2 + +kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + +conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2 + +kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = 
conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2 + +kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + +conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2 + +kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + +conv2d_nchw[T.Mul(0, 4) + 2] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2 + +kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + +conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 
63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2 + +kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2 + +kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + +conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") 
+conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2 + +kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + +conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2 + +kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + 
+conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2 + +kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x + 
+pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2 + +kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + +conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2 + +kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + +conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = 
T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2 + +kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + +conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2 + +kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + 
T.Mul(6, 3) + 2] + +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2 + +kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + +conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 
7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2 + +kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + +conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2 + +kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + +conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() 
+kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2 + +kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2 + +kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + 
T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2 + +kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + 
T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2 + +kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2 + +kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = 
T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2 + +kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + +conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2 + +kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * 
kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + +conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2 + +kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + +conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + 
+rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2 + +kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2 + +kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + +conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), 
scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2 + +kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + +conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2 + +kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + 
threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + +conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2 + +kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 
504 + T.Mul(2, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2 + +kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + +conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2 + +kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + +conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = 
T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2 + +kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + +conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2 + +kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 
63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2 + +kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + +conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + +T.Mul(2, 7) + 
+rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2 + +kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + +conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2 + +kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + +conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) 
+pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2 + +kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2 + +kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + 
+pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + +conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2 + +kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + +conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + 
T.Mul(4, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2 + +kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + +conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2 + +kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + +conv2d_nchw = 
T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2 + +kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + +conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2 + +kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 
+ T.Mul(5, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + +conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2 + +kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + +conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + 
rc_outer_inner * 24 + T.Mul(5, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2 + +kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2 + +kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + +conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 
3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2 + +kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + +conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2 + +kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + 
rc_outer_inner * 24 + T.Mul(6, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + +conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2 + +kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * 
kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2 + +kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2 + +kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + 
rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + +rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] + +T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2 + +kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + 
threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + 
rc_outer_inner * 24 + T.Mul(4, 3), 0)] +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] +conv2d_nchw[T.Mul(0, 4) + 1] = 
conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 
63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 
192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] +conv2d_nchw[T.Mul(1, 4) + 2] = 
conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 
63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + 
rc_outer_inner * 24 + T.Mul(0, 3) + 1] +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) 
+ rc_outer_inner * 24 + T.Mul(4, 3) + 1] +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) 
+ rc_outer_inner * 24 + T.Mul(1, 3) + 1] +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) 
+ rc_outer_inner * 24 + T.Mul(5, 3) + 1] +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) 
+ rc_outer_inner * 24 + T.Mul(1, 3) + 2] +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) 
+ rc_outer_inner * 24 + T.Mul(5, 3) + 2] +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] +conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] +conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] +conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) 
+ rc_outer_inner * 24 + T.Mul(2, 3) + 2] +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) 
+ rc_outer_inner * 24 + T.Mul(6, 3) + 2] +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] +conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] +conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] +conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +for rc_outer_inner in range(8): + conv2d_nchw = T.Buffer((8,), scope="local", align=32) + pad_temp_shared = T.Buffer((4032,), scope="shared") + threadIdx_x = T.int32() + kernel_shared = T.Buffer((1536,), scope="shared") + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + 
threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + 
rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = 
conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * 
kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 
+ T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = 
conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + 
T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + 
conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + 
T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + 
T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + 
T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + 
T.Mul(3, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + 
T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + 
T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +threadIdx_x = T.env_thread("threadIdx.x") +pad_temp_shared = T.Buffer((4032,), scope="shared") +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +with T.launch_thread(threadIdx_x, 49): + if T.Mul(0, 7) + threadIdx_x // 7 < 576: + if T.Mul(0, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(0, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(0, 7) + threadIdx_x // 7) % 9 and (T.Mul(0, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x // 7) % 9 * 7 + 
rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(1, 7) + threadIdx_x // 7 < 576: + if T.Mul(1, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(1, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(1, 7) + threadIdx_x // 7) % 9 and (T.Mul(1, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(2, 7) + threadIdx_x // 7 < 576: + if T.Mul(2, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(2, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(2, 7) + threadIdx_x // 7) % 9 and (T.Mul(2, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(3, 7) + threadIdx_x // 7 < 576: + if T.Mul(3, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(3, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(3, 7) + threadIdx_x // 7) % 9 and (T.Mul(3, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(4, 7) + threadIdx_x // 7 < 576: + if T.Mul(4, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(4, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(4, 7) + threadIdx_x // 7) % 9 and (T.Mul(4, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, 
data[rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(5, 7) + threadIdx_x // 7 < 576: + if T.Mul(5, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(5, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(5, 7) + threadIdx_x // 7) % 9 and (T.Mul(5, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(6, 7) + threadIdx_x // 7 < 576: + if T.Mul(6, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(6, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(6, 7) + threadIdx_x // 7) % 9 and (T.Mul(6, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(6, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(7, 7) + threadIdx_x // 7 < 576: + if T.Mul(7, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(7, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(7, 7) + threadIdx_x // 7) % 9 and (T.Mul(7, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(8, 7) + threadIdx_x // 7 < 576: + if T.Mul(8, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(8, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(8, 7) + threadIdx_x // 7) % 9 and 
(T.Mul(8, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(9, 7) + threadIdx_x // 7 < 576: + if T.Mul(9, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(9, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(9, 7) + threadIdx_x // 7) % 9 and (T.Mul(9, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(10, 7) + threadIdx_x // 7 < 576: + if T.Mul(10, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(10, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(10, 7) + threadIdx_x // 7) % 9 and (T.Mul(10, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(11, 7) + threadIdx_x // 7 < 576: + if T.Mul(11, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(11, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(11, 7) + threadIdx_x // 7) % 9 and (T.Mul(11, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(12, 7) + threadIdx_x // 7 < 576: + if T.Mul(12, 
49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(12, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(12, 7) + threadIdx_x // 7) % 9 and (T.Mul(12, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(13, 7) + threadIdx_x // 7 < 576: + if T.Mul(13, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(13, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(13, 7) + threadIdx_x // 7) % 9 and (T.Mul(13, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(13, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(14, 7) + threadIdx_x // 7 < 576: + if T.Mul(14, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(14, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(14, 7) + threadIdx_x // 7) % 9 and (T.Mul(14, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(15, 7) + threadIdx_x // 7 < 576: + if T.Mul(15, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(15, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(15, 7) + threadIdx_x // 7) % 9 and (T.Mul(15, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x // 7) % 9 * 7 + 
rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(16, 7) + threadIdx_x // 7 < 576: + if T.Mul(16, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(16, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(16, 7) + threadIdx_x // 7) % 9 and (T.Mul(16, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(16, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(17, 7) + threadIdx_x // 7 < 576: + if T.Mul(17, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(17, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(17, 7) + threadIdx_x // 7) % 9 and (T.Mul(17, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(18, 7) + threadIdx_x // 7 < 576: + if T.Mul(18, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(18, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(18, 7) + threadIdx_x // 7) % 9 and (T.Mul(18, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(19, 7) + threadIdx_x // 7 < 576: + if T.Mul(19, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(19, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(19, 7) + threadIdx_x // 7) % 9 and (T.Mul(19, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and 
rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(20, 7) + threadIdx_x // 7 < 576: + if T.Mul(20, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(20, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(20, 7) + threadIdx_x // 7) % 9 and (T.Mul(20, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(21, 7) + threadIdx_x // 7 < 576: + if T.Mul(21, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(21, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(21, 7) + threadIdx_x // 7) % 9 and (T.Mul(21, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(22, 7) + threadIdx_x // 7 < 576: + if T.Mul(22, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(22, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(22, 7) + threadIdx_x // 7) % 9 and (T.Mul(22, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(23, 7) + threadIdx_x // 7 < 576: + if T.Mul(23, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(23, 49) + threadIdx_x] = 
T.if_then_else(1 <= (T.Mul(23, 7) + threadIdx_x // 7) % 9 and (T.Mul(23, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(24, 7) + threadIdx_x // 7 < 576: + if T.Mul(24, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(24, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(24, 7) + threadIdx_x // 7) % 9 and (T.Mul(24, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(25, 7) + threadIdx_x // 7 < 576: + if T.Mul(25, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(25, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(25, 7) + threadIdx_x // 7) % 9 and (T.Mul(25, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(25, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(26, 7) + threadIdx_x // 7 < 576: + if T.Mul(26, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(26, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(26, 7) + threadIdx_x // 7) % 9 and (T.Mul(26, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with 
T.launch_thread(threadIdx_x, 49): + if T.Mul(27, 7) + threadIdx_x // 7 < 576: + if T.Mul(27, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(27, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(27, 7) + threadIdx_x // 7) % 9 and (T.Mul(27, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(28, 7) + threadIdx_x // 7 < 576: + if T.Mul(28, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(28, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(28, 7) + threadIdx_x // 7) % 9 and (T.Mul(28, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(29, 7) + threadIdx_x // 7 < 576: + if T.Mul(29, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(29, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(29, 7) + threadIdx_x // 7) % 9 and (T.Mul(29, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(30, 7) + threadIdx_x // 7 < 576: + if T.Mul(30, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(30, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(30, 7) + threadIdx_x // 7) % 9 and (T.Mul(30, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + 
(T.Mul(30, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(31, 7) + threadIdx_x // 7 < 576: + if T.Mul(31, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(31, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(31, 7) + threadIdx_x // 7) % 9 and (T.Mul(31, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(31, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(32, 7) + threadIdx_x // 7 < 576: + if T.Mul(32, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(32, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(32, 7) + threadIdx_x // 7) % 9 and (T.Mul(32, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(33, 7) + threadIdx_x // 7 < 576: + if T.Mul(33, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(33, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(33, 7) + threadIdx_x // 7) % 9 and (T.Mul(33, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(34, 7) + threadIdx_x // 7 < 576: + if T.Mul(34, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(34, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(34, 7) + threadIdx_x // 7) % 9 and (T.Mul(34, 
7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(35, 7) + threadIdx_x // 7 < 576: + if T.Mul(35, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(35, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(35, 7) + threadIdx_x // 7) % 9 and (T.Mul(35, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(36, 7) + threadIdx_x // 7 < 576: + if T.Mul(36, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(36, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(36, 7) + threadIdx_x // 7) % 9 and (T.Mul(36, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(37, 7) + threadIdx_x // 7 < 576: + if T.Mul(37, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(37, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(37, 7) + threadIdx_x // 7) % 9 and (T.Mul(37, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(37, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(38, 7) + threadIdx_x // 7 < 576: + if T.Mul(38, 49) 
+ threadIdx_x < 4032: + pad_temp_shared[T.Mul(38, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(38, 7) + threadIdx_x // 7) % 9 and (T.Mul(38, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(39, 7) + threadIdx_x // 7 < 576: + if T.Mul(39, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(39, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(39, 7) + threadIdx_x // 7) % 9 and (T.Mul(39, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(40, 7) + threadIdx_x // 7 < 576: + if T.Mul(40, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(40, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(40, 7) + threadIdx_x // 7) % 9 and (T.Mul(40, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(41, 7) + threadIdx_x // 7 < 576: + if T.Mul(41, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(41, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(41, 7) + threadIdx_x // 7) % 9 and (T.Mul(41, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x // 7) % 9 * 7 + 
rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(42, 7) + threadIdx_x // 7 < 576: + if T.Mul(42, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(42, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(42, 7) + threadIdx_x // 7) % 9 and (T.Mul(42, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(43, 7) + threadIdx_x // 7 < 576: + if T.Mul(43, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(43, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(43, 7) + threadIdx_x // 7) % 9 and (T.Mul(43, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(44, 7) + threadIdx_x // 7 < 576: + if T.Mul(44, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(44, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(44, 7) + threadIdx_x // 7) % 9 and (T.Mul(44, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(45, 7) + threadIdx_x // 7 < 576: + if T.Mul(45, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(45, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(45, 7) + threadIdx_x // 7) % 9 and (T.Mul(45, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and 
rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(46, 7) + threadIdx_x // 7 < 576: + if T.Mul(46, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(46, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(46, 7) + threadIdx_x // 7) % 9 and (T.Mul(46, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(47, 7) + threadIdx_x // 7 < 576: + if T.Mul(47, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(47, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(47, 7) + threadIdx_x // 7) % 9 and (T.Mul(47, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(48, 7) + threadIdx_x // 7 < 576: + if T.Mul(48, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(48, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(48, 7) + threadIdx_x // 7) % 9 and (T.Mul(48, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(49, 7) + threadIdx_x // 7 < 576: + if T.Mul(49, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(49, 49) + threadIdx_x] = 
T.if_then_else(1 <= (T.Mul(49, 7) + threadIdx_x // 7) % 9 and (T.Mul(49, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(50, 7) + threadIdx_x // 7 < 576: + if T.Mul(50, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(50, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(50, 7) + threadIdx_x // 7) % 9 and (T.Mul(50, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(51, 7) + threadIdx_x // 7 < 576: + if T.Mul(51, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(51, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(51, 7) + threadIdx_x // 7) % 9 and (T.Mul(51, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(51, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(52, 7) + threadIdx_x // 7 < 576: + if T.Mul(52, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(52, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(52, 7) + threadIdx_x // 7) % 9 and (T.Mul(52, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with 
T.launch_thread(threadIdx_x, 49): + if T.Mul(53, 7) + threadIdx_x // 7 < 576: + if T.Mul(53, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(53, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(53, 7) + threadIdx_x // 7) % 9 and (T.Mul(53, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(54, 7) + threadIdx_x // 7 < 576: + if T.Mul(54, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(54, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(54, 7) + threadIdx_x // 7) % 9 and (T.Mul(54, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(55, 7) + threadIdx_x // 7 < 576: + if T.Mul(55, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(55, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(55, 7) + threadIdx_x // 7) % 9 and (T.Mul(55, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(56, 7) + threadIdx_x // 7 < 576: + if T.Mul(56, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(56, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(56, 7) + threadIdx_x // 7) % 9 and (T.Mul(56, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + 
(T.Mul(56, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(57, 7) + threadIdx_x // 7 < 576: + if T.Mul(57, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(57, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(57, 7) + threadIdx_x // 7) % 9 and (T.Mul(57, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(58, 7) + threadIdx_x // 7 < 576: + if T.Mul(58, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(58, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(58, 7) + threadIdx_x // 7) % 9 and (T.Mul(58, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(59, 7) + threadIdx_x // 7 < 576: + if T.Mul(59, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(59, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(59, 7) + threadIdx_x // 7) % 9 and (T.Mul(59, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(60, 7) + threadIdx_x // 7 < 576: + if T.Mul(60, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(60, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(60, 7) + threadIdx_x // 7) % 9 and (T.Mul(60, 
7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(61, 7) + threadIdx_x // 7 < 576: + if T.Mul(61, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(61, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(61, 7) + threadIdx_x // 7) % 9 and (T.Mul(61, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(62, 7) + threadIdx_x // 7 < 576: + if T.Mul(62, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(62, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(62, 7) + threadIdx_x // 7) % 9 and (T.Mul(62, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(63, 7) + threadIdx_x // 7 < 576: + if T.Mul(63, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(63, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(63, 7) + threadIdx_x // 7) % 9 and (T.Mul(63, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(64, 7) + threadIdx_x // 7 < 576: + if T.Mul(64, 49) 
+ threadIdx_x < 4032: + pad_temp_shared[T.Mul(64, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(64, 7) + threadIdx_x // 7) % 9 and (T.Mul(64, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(65, 7) + threadIdx_x // 7 < 576: + if T.Mul(65, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(65, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(65, 7) + threadIdx_x // 7) % 9 and (T.Mul(65, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(66, 7) + threadIdx_x // 7 < 576: + if T.Mul(66, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(66, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(66, 7) + threadIdx_x // 7) % 9 and (T.Mul(66, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(67, 7) + threadIdx_x // 7 < 576: + if T.Mul(67, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(67, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(67, 7) + threadIdx_x // 7) % 9 and (T.Mul(67, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x // 7) % 9 * 7 + 
rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(68, 7) + threadIdx_x // 7 < 576: + if T.Mul(68, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(68, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(68, 7) + threadIdx_x // 7) % 9 and (T.Mul(68, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(69, 7) + threadIdx_x // 7 < 576: + if T.Mul(69, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(69, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(69, 7) + threadIdx_x // 7) % 9 and (T.Mul(69, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(70, 7) + threadIdx_x // 7 < 576: + if T.Mul(70, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(70, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(70, 7) + threadIdx_x // 7) % 9 and (T.Mul(70, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(71, 7) + threadIdx_x // 7 < 576: + if T.Mul(71, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(71, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(71, 7) + threadIdx_x // 7) % 9 and (T.Mul(71, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and 
rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(72, 7) + threadIdx_x // 7 < 576: + if T.Mul(72, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(72, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(72, 7) + threadIdx_x // 7) % 9 and (T.Mul(72, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(73, 7) + threadIdx_x // 7 < 576: + if T.Mul(73, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(73, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(73, 7) + threadIdx_x // 7) % 9 and (T.Mul(73, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(74, 7) + threadIdx_x // 7 < 576: + if T.Mul(74, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(74, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(74, 7) + threadIdx_x // 7) % 9 and (T.Mul(74, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(75, 7) + threadIdx_x // 7 < 576: + if T.Mul(75, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(75, 49) + threadIdx_x] = 
T.if_then_else(1 <= (T.Mul(75, 7) + threadIdx_x // 7) % 9 and (T.Mul(75, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(76, 7) + threadIdx_x // 7 < 576: + if T.Mul(76, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(76, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(76, 7) + threadIdx_x // 7) % 9 and (T.Mul(76, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(77, 7) + threadIdx_x // 7 < 576: + if T.Mul(77, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(77, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(77, 7) + threadIdx_x // 7) % 9 and (T.Mul(77, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(77, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(78, 7) + threadIdx_x // 7 < 576: + if T.Mul(78, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(78, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(78, 7) + threadIdx_x // 7) % 9 and (T.Mul(78, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with 
T.launch_thread(threadIdx_x, 49): + if T.Mul(79, 7) + threadIdx_x // 7 < 576: + if T.Mul(79, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(79, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(79, 7) + threadIdx_x // 7) % 9 and (T.Mul(79, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(80, 7) + threadIdx_x // 7 < 576: + if T.Mul(80, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(80, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(80, 7) + threadIdx_x // 7) % 9 and (T.Mul(80, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(81, 7) + threadIdx_x // 7 < 576: + if T.Mul(81, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(81, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(81, 7) + threadIdx_x // 7) % 9 and (T.Mul(81, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if T.Mul(82, 7) + threadIdx_x // 7 < 576: + if T.Mul(82, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(82, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(82, 7) + threadIdx_x // 7) % 9 and (T.Mul(82, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + 
(T.Mul(82, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +threadIdx_x_1 = T.env_thread("threadIdx.x") +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(0, 49) + threadIdx_x_1 < 1536: + blockIdx_x = T.int32() + kernel_shared[T.Mul(0, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(0, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(0, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] +blockIdx_x = T.int32() +with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(1, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(1, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(1, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(1, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(2, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(2, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(2, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(2, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(3, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(3, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(3, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(3, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(4, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(4, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(4, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(4, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(5, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(5, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(5, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + 
(T.Mul(5, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(6, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(6, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(6, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(6, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(7, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(7, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(7, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(7, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(8, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(8, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(8, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(8, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(9, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(9, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(9, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(9, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(10, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(10, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(10, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(10, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(11, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(11, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(11, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(11, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(12, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(12, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(12, 49) + threadIdx_x_1) // 
192 * 4608 + rc_outer_outer * 576 + (T.Mul(12, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(13, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(13, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(13, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(13, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(14, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(14, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(14, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(14, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(15, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(15, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(15, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(15, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(16, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(16, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(16, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(16, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(17, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(17, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(17, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(17, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(18, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(18, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(18, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(18, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(19, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(19, 49) + threadIdx_x_1] = 
kernel[blockIdx_x * 36864 + (T.Mul(19, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(19, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(20, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(20, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(20, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(20, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(21, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(21, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(21, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(21, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(22, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(22, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(22, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(22, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(23, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(23, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(23, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(23, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(24, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(24, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(24, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(24, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(25, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(25, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(25, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(25, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(26, 49) + threadIdx_x_1 < 
1536: + kernel_shared[T.Mul(26, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(26, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(26, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(27, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(27, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(27, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(27, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(28, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(28, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(28, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(28, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(29, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(29, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(29, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(29, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(30, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(30, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(30, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(30, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(31, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(31, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(31, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(31, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] +for rc_outer_inner in range(8): + conv2d_nchw = T.Buffer((8,), scope="local", align=32) + threadIdx_x_2 = T.int32() + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + 
T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] 
+ conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 
4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * 
kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + 
rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + 
conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) 
+ 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + 
T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + 
T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * 
kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + 
conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * 
kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + 
conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + 
rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * 
kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + 
conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + 
rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +for rx_outer_outer in range(3): + threadIdx_x = 
T.env_thread("threadIdx.x") + pad_temp_shared = T.Buffer((4032,), scope="shared") + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + with T.launch_thread(threadIdx_x, 49): + if T.Mul(0, 7) + threadIdx_x // 7 < 576: + if T.Mul(0, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(0, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(0, 7) + threadIdx_x // 7) % 9 and (T.Mul(0, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(1, 7) + threadIdx_x // 7 < 576: + if T.Mul(1, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(1, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(1, 7) + threadIdx_x // 7) % 9 and (T.Mul(1, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(2, 7) + threadIdx_x // 7 < 576: + if T.Mul(2, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(2, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(2, 7) + threadIdx_x // 7) % 9 and (T.Mul(2, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(3, 7) + threadIdx_x // 7 < 576: + if T.Mul(3, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(3, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(3, 7) + threadIdx_x // 7) % 9 and (T.Mul(3, 7) + threadIdx_x // 7) % 9 
< 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(4, 7) + threadIdx_x // 7 < 576: + if T.Mul(4, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(4, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(4, 7) + threadIdx_x // 7) % 9 and (T.Mul(4, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(5, 7) + threadIdx_x // 7 < 576: + if T.Mul(5, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(5, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(5, 7) + threadIdx_x // 7) % 9 and (T.Mul(5, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(6, 7) + threadIdx_x // 7 < 576: + if T.Mul(6, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(6, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(6, 7) + threadIdx_x // 7) % 9 and (T.Mul(6, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(6, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(7, 7) + threadIdx_x // 7 < 576: + if T.Mul(7, 49) + threadIdx_x < 4032: + 
pad_temp_shared[T.Mul(7, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(7, 7) + threadIdx_x // 7) % 9 and (T.Mul(7, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(8, 7) + threadIdx_x // 7 < 576: + if T.Mul(8, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(8, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(8, 7) + threadIdx_x // 7) % 9 and (T.Mul(8, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(9, 7) + threadIdx_x // 7 < 576: + if T.Mul(9, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(9, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(9, 7) + threadIdx_x // 7) % 9 and (T.Mul(9, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(10, 7) + threadIdx_x // 7 < 576: + if T.Mul(10, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(10, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(10, 7) + threadIdx_x // 7) % 9 and (T.Mul(10, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], 
T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(11, 7) + threadIdx_x // 7 < 576: + if T.Mul(11, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(11, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(11, 7) + threadIdx_x // 7) % 9 and (T.Mul(11, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(12, 7) + threadIdx_x // 7 < 576: + if T.Mul(12, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(12, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(12, 7) + threadIdx_x // 7) % 9 and (T.Mul(12, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(13, 7) + threadIdx_x // 7 < 576: + if T.Mul(13, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(13, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(13, 7) + threadIdx_x // 7) % 9 and (T.Mul(13, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(13, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(14, 7) + threadIdx_x // 7 < 576: + if T.Mul(14, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(14, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(14, 7) + threadIdx_x // 7) % 9 and (T.Mul(14, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, 
data[rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(15, 7) + threadIdx_x // 7 < 576: + if T.Mul(15, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(15, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(15, 7) + threadIdx_x // 7) % 9 and (T.Mul(15, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(16, 7) + threadIdx_x // 7 < 576: + if T.Mul(16, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(16, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(16, 7) + threadIdx_x // 7) % 9 and (T.Mul(16, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(16, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(17, 7) + threadIdx_x // 7 < 576: + if T.Mul(17, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(17, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(17, 7) + threadIdx_x // 7) % 9 and (T.Mul(17, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(18, 7) + threadIdx_x // 7 < 576: + if T.Mul(18, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(18, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(18, 7) + 
threadIdx_x // 7) % 9 and (T.Mul(18, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(19, 7) + threadIdx_x // 7 < 576: + if T.Mul(19, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(19, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(19, 7) + threadIdx_x // 7) % 9 and (T.Mul(19, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(20, 7) + threadIdx_x // 7 < 576: + if T.Mul(20, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(20, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(20, 7) + threadIdx_x // 7) % 9 and (T.Mul(20, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(21, 7) + threadIdx_x // 7 < 576: + if T.Mul(21, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(21, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(21, 7) + threadIdx_x // 7) % 9 and (T.Mul(21, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(22, 7) + 
threadIdx_x // 7 < 576: + if T.Mul(22, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(22, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(22, 7) + threadIdx_x // 7) % 9 and (T.Mul(22, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(23, 7) + threadIdx_x // 7 < 576: + if T.Mul(23, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(23, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(23, 7) + threadIdx_x // 7) % 9 and (T.Mul(23, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(24, 7) + threadIdx_x // 7 < 576: + if T.Mul(24, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(24, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(24, 7) + threadIdx_x // 7) % 9 and (T.Mul(24, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(25, 7) + threadIdx_x // 7 < 576: + if T.Mul(25, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(25, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(25, 7) + threadIdx_x // 7) % 9 and (T.Mul(25, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(25, 
7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(26, 7) + threadIdx_x // 7 < 576: + if T.Mul(26, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(26, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(26, 7) + threadIdx_x // 7) % 9 and (T.Mul(26, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(27, 7) + threadIdx_x // 7 < 576: + if T.Mul(27, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(27, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(27, 7) + threadIdx_x // 7) % 9 and (T.Mul(27, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(28, 7) + threadIdx_x // 7 < 576: + if T.Mul(28, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(28, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(28, 7) + threadIdx_x // 7) % 9 and (T.Mul(28, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(29, 7) + threadIdx_x // 7 < 576: + if T.Mul(29, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(29, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(29, 7) + threadIdx_x // 7) % 9 and (T.Mul(29, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= 
rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(30, 7) + threadIdx_x // 7 < 576: + if T.Mul(30, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(30, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(30, 7) + threadIdx_x // 7) % 9 and (T.Mul(30, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(31, 7) + threadIdx_x // 7 < 576: + if T.Mul(31, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(31, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(31, 7) + threadIdx_x // 7) % 9 and (T.Mul(31, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(31, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(32, 7) + threadIdx_x // 7 < 576: + if T.Mul(32, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(32, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(32, 7) + threadIdx_x // 7) % 9 and (T.Mul(32, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(33, 7) + threadIdx_x // 7 < 576: + if T.Mul(33, 49) + threadIdx_x < 4032: + 
pad_temp_shared[T.Mul(33, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(33, 7) + threadIdx_x // 7) % 9 and (T.Mul(33, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(34, 7) + threadIdx_x // 7 < 576: + if T.Mul(34, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(34, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(34, 7) + threadIdx_x // 7) % 9 and (T.Mul(34, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(35, 7) + threadIdx_x // 7 < 576: + if T.Mul(35, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(35, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(35, 7) + threadIdx_x // 7) % 9 and (T.Mul(35, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(36, 7) + threadIdx_x // 7 < 576: + if T.Mul(36, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(36, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(36, 7) + threadIdx_x // 7) % 9 and (T.Mul(36, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 
8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(37, 7) + threadIdx_x // 7 < 576: + if T.Mul(37, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(37, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(37, 7) + threadIdx_x // 7) % 9 and (T.Mul(37, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(37, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(38, 7) + threadIdx_x // 7 < 576: + if T.Mul(38, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(38, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(38, 7) + threadIdx_x // 7) % 9 and (T.Mul(38, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(39, 7) + threadIdx_x // 7 < 576: + if T.Mul(39, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(39, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(39, 7) + threadIdx_x // 7) % 9 and (T.Mul(39, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(40, 7) + threadIdx_x // 7 < 576: + if T.Mul(40, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(40, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(40, 7) + threadIdx_x // 7) % 9 and (T.Mul(40, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, 
data[rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(41, 7) + threadIdx_x // 7 < 576: + if T.Mul(41, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(41, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(41, 7) + threadIdx_x // 7) % 9 and (T.Mul(41, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(42, 7) + threadIdx_x // 7 < 576: + if T.Mul(42, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(42, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(42, 7) + threadIdx_x // 7) % 9 and (T.Mul(42, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(43, 7) + threadIdx_x // 7 < 576: + if T.Mul(43, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(43, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(43, 7) + threadIdx_x // 7) % 9 and (T.Mul(43, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(44, 7) + threadIdx_x // 7 < 576: + if T.Mul(44, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(44, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(44, 7) + 
threadIdx_x // 7) % 9 and (T.Mul(44, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(45, 7) + threadIdx_x // 7 < 576: + if T.Mul(45, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(45, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(45, 7) + threadIdx_x // 7) % 9 and (T.Mul(45, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(46, 7) + threadIdx_x // 7 < 576: + if T.Mul(46, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(46, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(46, 7) + threadIdx_x // 7) % 9 and (T.Mul(46, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(47, 7) + threadIdx_x // 7 < 576: + if T.Mul(47, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(47, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(47, 7) + threadIdx_x // 7) % 9 and (T.Mul(47, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(48, 7) + 
threadIdx_x // 7 < 576: + if T.Mul(48, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(48, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(48, 7) + threadIdx_x // 7) % 9 and (T.Mul(48, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(49, 7) + threadIdx_x // 7 < 576: + if T.Mul(49, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(49, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(49, 7) + threadIdx_x // 7) % 9 and (T.Mul(49, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(50, 7) + threadIdx_x // 7 < 576: + if T.Mul(50, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(50, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(50, 7) + threadIdx_x // 7) % 9 and (T.Mul(50, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(51, 7) + threadIdx_x // 7 < 576: + if T.Mul(51, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(51, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(51, 7) + threadIdx_x // 7) % 9 and (T.Mul(51, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(51, 
7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(52, 7) + threadIdx_x // 7 < 576: + if T.Mul(52, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(52, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(52, 7) + threadIdx_x // 7) % 9 and (T.Mul(52, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(53, 7) + threadIdx_x // 7 < 576: + if T.Mul(53, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(53, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(53, 7) + threadIdx_x // 7) % 9 and (T.Mul(53, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(54, 7) + threadIdx_x // 7 < 576: + if T.Mul(54, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(54, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(54, 7) + threadIdx_x // 7) % 9 and (T.Mul(54, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(55, 7) + threadIdx_x // 7 < 576: + if T.Mul(55, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(55, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(55, 7) + threadIdx_x // 7) % 9 and (T.Mul(55, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= 
rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(56, 7) + threadIdx_x // 7 < 576: + if T.Mul(56, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(56, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(56, 7) + threadIdx_x // 7) % 9 and (T.Mul(56, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(57, 7) + threadIdx_x // 7 < 576: + if T.Mul(57, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(57, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(57, 7) + threadIdx_x // 7) % 9 and (T.Mul(57, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(58, 7) + threadIdx_x // 7 < 576: + if T.Mul(58, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(58, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(58, 7) + threadIdx_x // 7) % 9 and (T.Mul(58, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(59, 7) + threadIdx_x // 7 < 576: + if T.Mul(59, 49) + threadIdx_x < 4032: + 
pad_temp_shared[T.Mul(59, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(59, 7) + threadIdx_x // 7) % 9 and (T.Mul(59, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(60, 7) + threadIdx_x // 7 < 576: + if T.Mul(60, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(60, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(60, 7) + threadIdx_x // 7) % 9 and (T.Mul(60, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(61, 7) + threadIdx_x // 7 < 576: + if T.Mul(61, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(61, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(61, 7) + threadIdx_x // 7) % 9 and (T.Mul(61, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(62, 7) + threadIdx_x // 7 < 576: + if T.Mul(62, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(62, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(62, 7) + threadIdx_x // 7) % 9 and (T.Mul(62, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 
8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(63, 7) + threadIdx_x // 7 < 576: + if T.Mul(63, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(63, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(63, 7) + threadIdx_x // 7) % 9 and (T.Mul(63, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(64, 7) + threadIdx_x // 7 < 576: + if T.Mul(64, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(64, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(64, 7) + threadIdx_x // 7) % 9 and (T.Mul(64, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(65, 7) + threadIdx_x // 7 < 576: + if T.Mul(65, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(65, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(65, 7) + threadIdx_x // 7) % 9 and (T.Mul(65, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(66, 7) + threadIdx_x // 7 < 576: + if T.Mul(66, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(66, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(66, 7) + threadIdx_x // 7) % 9 and (T.Mul(66, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, 
data[rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(67, 7) + threadIdx_x // 7 < 576: + if T.Mul(67, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(67, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(67, 7) + threadIdx_x // 7) % 9 and (T.Mul(67, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(68, 7) + threadIdx_x // 7 < 576: + if T.Mul(68, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(68, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(68, 7) + threadIdx_x // 7) % 9 and (T.Mul(68, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(69, 7) + threadIdx_x // 7 < 576: + if T.Mul(69, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(69, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(69, 7) + threadIdx_x // 7) % 9 and (T.Mul(69, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(70, 7) + threadIdx_x // 7 < 576: + if T.Mul(70, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(70, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(70, 7) + 
threadIdx_x // 7) % 9 and (T.Mul(70, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(71, 7) + threadIdx_x // 7 < 576: + if T.Mul(71, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(71, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(71, 7) + threadIdx_x // 7) % 9 and (T.Mul(71, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(72, 7) + threadIdx_x // 7 < 576: + if T.Mul(72, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(72, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(72, 7) + threadIdx_x // 7) % 9 and (T.Mul(72, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(73, 7) + threadIdx_x // 7 < 576: + if T.Mul(73, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(73, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(73, 7) + threadIdx_x // 7) % 9 and (T.Mul(73, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(74, 7) + 
threadIdx_x // 7 < 576: + if T.Mul(74, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(74, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(74, 7) + threadIdx_x // 7) % 9 and (T.Mul(74, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(75, 7) + threadIdx_x // 7 < 576: + if T.Mul(75, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(75, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(75, 7) + threadIdx_x // 7) % 9 and (T.Mul(75, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(76, 7) + threadIdx_x // 7 < 576: + if T.Mul(76, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(76, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(76, 7) + threadIdx_x // 7) % 9 and (T.Mul(76, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(77, 7) + threadIdx_x // 7 < 576: + if T.Mul(77, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(77, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(77, 7) + threadIdx_x // 7) % 9 and (T.Mul(77, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(77, 
7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(78, 7) + threadIdx_x // 7 < 576: + if T.Mul(78, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(78, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(78, 7) + threadIdx_x // 7) % 9 and (T.Mul(78, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(79, 7) + threadIdx_x // 7 < 576: + if T.Mul(79, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(79, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(79, 7) + threadIdx_x // 7) % 9 and (T.Mul(79, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(80, 7) + threadIdx_x // 7 < 576: + if T.Mul(80, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(80, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(80, 7) + threadIdx_x // 7) % 9 and (T.Mul(80, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(81, 7) + threadIdx_x // 7 < 576: + if T.Mul(81, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(81, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(81, 7) + threadIdx_x // 7) % 9 and (T.Mul(81, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= 
rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(82, 7) + threadIdx_x // 7 < 576: + if T.Mul(82, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(82, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(82, 7) + threadIdx_x // 7) % 9 and (T.Mul(82, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(82, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + threadIdx_x_1 = T.env_thread("threadIdx.x") + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(0, 49) + threadIdx_x_1 < 1536: + blockIdx_x = T.int32() + kernel_shared[T.Mul(0, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(0, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(0, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + blockIdx_x = T.int32() + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(1, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(1, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(1, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(1, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(2, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(2, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(2, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(2, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(3, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(3, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + 
(T.Mul(3, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(3, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(4, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(4, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(4, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(4, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(5, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(5, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(5, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(5, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(6, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(6, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(6, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(6, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(7, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(7, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(7, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(7, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(8, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(8, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(8, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(8, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(9, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(9, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(9, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(9, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(10, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(10, 49) + 
threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(10, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(10, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(11, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(11, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(11, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(11, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(12, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(12, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(12, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(12, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(13, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(13, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(13, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(13, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(14, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(14, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(14, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(14, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(15, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(15, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(15, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(15, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(16, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(16, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(16, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(16, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(17, 
49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(17, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(17, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(17, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(18, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(18, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(18, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(18, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(19, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(19, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(19, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(19, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(20, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(20, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(20, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(20, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(21, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(21, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(21, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(21, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(22, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(22, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(22, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(22, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(23, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(23, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(23, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(23, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] 
+ with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(24, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(24, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(24, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(24, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(25, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(25, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(25, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(25, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(26, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(26, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(26, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(26, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(27, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(27, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(27, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(27, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(28, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(28, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(28, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(28, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(29, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(29, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(29, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(29, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(30, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(30, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(30, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + 
(T.Mul(30, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(31, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(31, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(31, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(31, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + for rc_outer_inner in range(8): + conv2d_nchw = T.Buffer((8,), scope="local", align=32) + threadIdx_x_2 = T.int32() + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + 
T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * 
kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + 
rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + 
conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) 
+ 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 
63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * 
kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + 
conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * 
kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + 
conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + 
rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * 
kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + 
conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + 
rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = 
conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) 
+ threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] 
+ conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + threadIdx_x = T.env_thread("threadIdx.x") + pad_temp_shared = T.Buffer((4032,), scope="shared") + data = T.Buffer((25088,)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(0, 7) + threadIdx_x // 7 < 576: + if T.Mul(0, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(0, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(0, 7) + threadIdx_x // 7) % 9 and (T.Mul(0, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(1, 7) + threadIdx_x // 7 < 576: + if T.Mul(1, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(1, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(1, 7) + threadIdx_x // 7) % 9 and (T.Mul(1, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(2, 7) + threadIdx_x // 7 < 576: + if T.Mul(2, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(2, 49) + 
threadIdx_x] = T.if_then_else(1 <= (T.Mul(2, 7) + threadIdx_x // 7) % 9 and (T.Mul(2, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(3, 7) + threadIdx_x // 7 < 576: + if T.Mul(3, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(3, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(3, 7) + threadIdx_x // 7) % 9 and (T.Mul(3, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(4, 7) + threadIdx_x // 7 < 576: + if T.Mul(4, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(4, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(4, 7) + threadIdx_x // 7) % 9 and (T.Mul(4, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(5, 7) + threadIdx_x // 7 < 576: + if T.Mul(5, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(5, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(5, 7) + threadIdx_x // 7) % 9 and (T.Mul(5, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 
49): + if T.Mul(6, 7) + threadIdx_x // 7 < 576: + if T.Mul(6, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(6, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(6, 7) + threadIdx_x // 7) % 9 and (T.Mul(6, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(6, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(7, 7) + threadIdx_x // 7 < 576: + if T.Mul(7, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(7, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(7, 7) + threadIdx_x // 7) % 9 and (T.Mul(7, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(8, 7) + threadIdx_x // 7 < 576: + if T.Mul(8, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(8, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(8, 7) + threadIdx_x // 7) % 9 and (T.Mul(8, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(9, 7) + threadIdx_x // 7 < 576: + if T.Mul(9, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(9, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(9, 7) + threadIdx_x // 7) % 9 and (T.Mul(9, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(9, 7) 
+ threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(10, 7) + threadIdx_x // 7 < 576: + if T.Mul(10, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(10, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(10, 7) + threadIdx_x // 7) % 9 and (T.Mul(10, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(11, 7) + threadIdx_x // 7 < 576: + if T.Mul(11, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(11, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(11, 7) + threadIdx_x // 7) % 9 and (T.Mul(11, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(12, 7) + threadIdx_x // 7 < 576: + if T.Mul(12, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(12, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(12, 7) + threadIdx_x // 7) % 9 and (T.Mul(12, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(13, 7) + threadIdx_x // 7 < 576: + if T.Mul(13, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(13, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(13, 7) + threadIdx_x // 7) % 9 and (T.Mul(13, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer 
+ threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(13, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(14, 7) + threadIdx_x // 7 < 576: + if T.Mul(14, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(14, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(14, 7) + threadIdx_x // 7) % 9 and (T.Mul(14, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(15, 7) + threadIdx_x // 7 < 576: + if T.Mul(15, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(15, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(15, 7) + threadIdx_x // 7) % 9 and (T.Mul(15, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(16, 7) + threadIdx_x // 7 < 576: + if T.Mul(16, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(16, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(16, 7) + threadIdx_x // 7) % 9 and (T.Mul(16, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(16, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(17, 7) + threadIdx_x // 7 < 576: + if T.Mul(17, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(17, 
49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(17, 7) + threadIdx_x // 7) % 9 and (T.Mul(17, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(18, 7) + threadIdx_x // 7 < 576: + if T.Mul(18, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(18, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(18, 7) + threadIdx_x // 7) % 9 and (T.Mul(18, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(19, 7) + threadIdx_x // 7 < 576: + if T.Mul(19, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(19, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(19, 7) + threadIdx_x // 7) % 9 and (T.Mul(19, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(20, 7) + threadIdx_x // 7 < 576: + if T.Mul(20, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(20, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(20, 7) + threadIdx_x // 7) % 9 and (T.Mul(20, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + 
with T.launch_thread(threadIdx_x, 49): + if T.Mul(21, 7) + threadIdx_x // 7 < 576: + if T.Mul(21, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(21, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(21, 7) + threadIdx_x // 7) % 9 and (T.Mul(21, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(22, 7) + threadIdx_x // 7 < 576: + if T.Mul(22, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(22, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(22, 7) + threadIdx_x // 7) % 9 and (T.Mul(22, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(23, 7) + threadIdx_x // 7 < 576: + if T.Mul(23, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(23, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(23, 7) + threadIdx_x // 7) % 9 and (T.Mul(23, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(24, 7) + threadIdx_x // 7 < 576: + if T.Mul(24, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(24, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(24, 7) + threadIdx_x // 7) % 9 and (T.Mul(24, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 
+ (T.Mul(24, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(25, 7) + threadIdx_x // 7 < 576: + if T.Mul(25, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(25, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(25, 7) + threadIdx_x // 7) % 9 and (T.Mul(25, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(25, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(26, 7) + threadIdx_x // 7 < 576: + if T.Mul(26, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(26, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(26, 7) + threadIdx_x // 7) % 9 and (T.Mul(26, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(27, 7) + threadIdx_x // 7 < 576: + if T.Mul(27, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(27, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(27, 7) + threadIdx_x // 7) % 9 and (T.Mul(27, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(28, 7) + threadIdx_x // 7 < 576: + if T.Mul(28, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(28, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(28, 7) + threadIdx_x // 7) % 9 and 
(T.Mul(28, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(29, 7) + threadIdx_x // 7 < 576: + if T.Mul(29, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(29, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(29, 7) + threadIdx_x // 7) % 9 and (T.Mul(29, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(30, 7) + threadIdx_x // 7 < 576: + if T.Mul(30, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(30, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(30, 7) + threadIdx_x // 7) % 9 and (T.Mul(30, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(31, 7) + threadIdx_x // 7 < 576: + if T.Mul(31, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(31, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(31, 7) + threadIdx_x // 7) % 9 and (T.Mul(31, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(31, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(32, 7) + threadIdx_x // 7 < 576: + 
if T.Mul(32, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(32, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(32, 7) + threadIdx_x // 7) % 9 and (T.Mul(32, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(33, 7) + threadIdx_x // 7 < 576: + if T.Mul(33, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(33, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(33, 7) + threadIdx_x // 7) % 9 and (T.Mul(33, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(34, 7) + threadIdx_x // 7 < 576: + if T.Mul(34, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(34, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(34, 7) + threadIdx_x // 7) % 9 and (T.Mul(34, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(35, 7) + threadIdx_x // 7 < 576: + if T.Mul(35, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(35, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(35, 7) + threadIdx_x // 7) % 9 and (T.Mul(35, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x // 7) % 9 
* 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(36, 7) + threadIdx_x // 7 < 576: + if T.Mul(36, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(36, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(36, 7) + threadIdx_x // 7) % 9 and (T.Mul(36, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(37, 7) + threadIdx_x // 7 < 576: + if T.Mul(37, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(37, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(37, 7) + threadIdx_x // 7) % 9 and (T.Mul(37, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(37, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(38, 7) + threadIdx_x // 7 < 576: + if T.Mul(38, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(38, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(38, 7) + threadIdx_x // 7) % 9 and (T.Mul(38, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(39, 7) + threadIdx_x // 7 < 576: + if T.Mul(39, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(39, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(39, 7) + threadIdx_x // 7) % 9 and (T.Mul(39, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and 
rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(40, 7) + threadIdx_x // 7 < 576: + if T.Mul(40, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(40, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(40, 7) + threadIdx_x // 7) % 9 and (T.Mul(40, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(41, 7) + threadIdx_x // 7 < 576: + if T.Mul(41, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(41, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(41, 7) + threadIdx_x // 7) % 9 and (T.Mul(41, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(42, 7) + threadIdx_x // 7 < 576: + if T.Mul(42, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(42, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(42, 7) + threadIdx_x // 7) % 9 and (T.Mul(42, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(43, 7) + threadIdx_x // 7 < 576: + if T.Mul(43, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(43, 49) + threadIdx_x] = 
T.if_then_else(1 <= (T.Mul(43, 7) + threadIdx_x // 7) % 9 and (T.Mul(43, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(44, 7) + threadIdx_x // 7 < 576: + if T.Mul(44, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(44, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(44, 7) + threadIdx_x // 7) % 9 and (T.Mul(44, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(45, 7) + threadIdx_x // 7 < 576: + if T.Mul(45, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(45, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(45, 7) + threadIdx_x // 7) % 9 and (T.Mul(45, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(46, 7) + threadIdx_x // 7 < 576: + if T.Mul(46, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(46, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(46, 7) + threadIdx_x // 7) % 9 and (T.Mul(46, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with 
T.launch_thread(threadIdx_x, 49): + if T.Mul(47, 7) + threadIdx_x // 7 < 576: + if T.Mul(47, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(47, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(47, 7) + threadIdx_x // 7) % 9 and (T.Mul(47, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(48, 7) + threadIdx_x // 7 < 576: + if T.Mul(48, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(48, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(48, 7) + threadIdx_x // 7) % 9 and (T.Mul(48, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(49, 7) + threadIdx_x // 7 < 576: + if T.Mul(49, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(49, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(49, 7) + threadIdx_x // 7) % 9 and (T.Mul(49, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(50, 7) + threadIdx_x // 7 < 576: + if T.Mul(50, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(50, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(50, 7) + threadIdx_x // 7) % 9 and (T.Mul(50, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + 
(T.Mul(50, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(51, 7) + threadIdx_x // 7 < 576: + if T.Mul(51, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(51, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(51, 7) + threadIdx_x // 7) % 9 and (T.Mul(51, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(51, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(52, 7) + threadIdx_x // 7 < 576: + if T.Mul(52, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(52, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(52, 7) + threadIdx_x // 7) % 9 and (T.Mul(52, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(53, 7) + threadIdx_x // 7 < 576: + if T.Mul(53, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(53, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(53, 7) + threadIdx_x // 7) % 9 and (T.Mul(53, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(54, 7) + threadIdx_x // 7 < 576: + if T.Mul(54, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(54, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(54, 7) + threadIdx_x // 7) % 9 and 
(T.Mul(54, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(55, 7) + threadIdx_x // 7 < 576: + if T.Mul(55, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(55, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(55, 7) + threadIdx_x // 7) % 9 and (T.Mul(55, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(56, 7) + threadIdx_x // 7 < 576: + if T.Mul(56, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(56, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(56, 7) + threadIdx_x // 7) % 9 and (T.Mul(56, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(57, 7) + threadIdx_x // 7 < 576: + if T.Mul(57, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(57, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(57, 7) + threadIdx_x // 7) % 9 and (T.Mul(57, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(58, 7) + threadIdx_x // 7 < 576: + 
if T.Mul(58, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(58, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(58, 7) + threadIdx_x // 7) % 9 and (T.Mul(58, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(59, 7) + threadIdx_x // 7 < 576: + if T.Mul(59, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(59, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(59, 7) + threadIdx_x // 7) % 9 and (T.Mul(59, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(60, 7) + threadIdx_x // 7 < 576: + if T.Mul(60, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(60, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(60, 7) + threadIdx_x // 7) % 9 and (T.Mul(60, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(61, 7) + threadIdx_x // 7 < 576: + if T.Mul(61, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(61, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(61, 7) + threadIdx_x // 7) % 9 and (T.Mul(61, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x // 7) % 9 
* 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(62, 7) + threadIdx_x // 7 < 576: + if T.Mul(62, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(62, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(62, 7) + threadIdx_x // 7) % 9 and (T.Mul(62, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(63, 7) + threadIdx_x // 7 < 576: + if T.Mul(63, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(63, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(63, 7) + threadIdx_x // 7) % 9 and (T.Mul(63, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(64, 7) + threadIdx_x // 7 < 576: + if T.Mul(64, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(64, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(64, 7) + threadIdx_x // 7) % 9 and (T.Mul(64, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(65, 7) + threadIdx_x // 7 < 576: + if T.Mul(65, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(65, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(65, 7) + threadIdx_x // 7) % 9 and (T.Mul(65, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and 
rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(66, 7) + threadIdx_x // 7 < 576: + if T.Mul(66, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(66, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(66, 7) + threadIdx_x // 7) % 9 and (T.Mul(66, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(67, 7) + threadIdx_x // 7 < 576: + if T.Mul(67, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(67, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(67, 7) + threadIdx_x // 7) % 9 and (T.Mul(67, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(68, 7) + threadIdx_x // 7 < 576: + if T.Mul(68, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(68, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(68, 7) + threadIdx_x // 7) % 9 and (T.Mul(68, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(69, 7) + threadIdx_x // 7 < 576: + if T.Mul(69, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(69, 49) + threadIdx_x] = 
T.if_then_else(1 <= (T.Mul(69, 7) + threadIdx_x // 7) % 9 and (T.Mul(69, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(70, 7) + threadIdx_x // 7 < 576: + if T.Mul(70, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(70, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(70, 7) + threadIdx_x // 7) % 9 and (T.Mul(70, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(71, 7) + threadIdx_x // 7 < 576: + if T.Mul(71, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(71, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(71, 7) + threadIdx_x // 7) % 9 and (T.Mul(71, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(72, 7) + threadIdx_x // 7 < 576: + if T.Mul(72, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(72, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(72, 7) + threadIdx_x // 7) % 9 and (T.Mul(72, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with 
T.launch_thread(threadIdx_x, 49): + if T.Mul(73, 7) + threadIdx_x // 7 < 576: + if T.Mul(73, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(73, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(73, 7) + threadIdx_x // 7) % 9 and (T.Mul(73, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(74, 7) + threadIdx_x // 7 < 576: + if T.Mul(74, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(74, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(74, 7) + threadIdx_x // 7) % 9 and (T.Mul(74, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(75, 7) + threadIdx_x // 7 < 576: + if T.Mul(75, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(75, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(75, 7) + threadIdx_x // 7) % 9 and (T.Mul(75, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(76, 7) + threadIdx_x // 7 < 576: + if T.Mul(76, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(76, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(76, 7) + threadIdx_x // 7) % 9 and (T.Mul(76, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + 
(T.Mul(76, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(77, 7) + threadIdx_x // 7 < 576: + if T.Mul(77, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(77, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(77, 7) + threadIdx_x // 7) % 9 and (T.Mul(77, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(77, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(78, 7) + threadIdx_x // 7 < 576: + if T.Mul(78, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(78, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(78, 7) + threadIdx_x // 7) % 9 and (T.Mul(78, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(79, 7) + threadIdx_x // 7 < 576: + if T.Mul(79, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(79, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(79, 7) + threadIdx_x // 7) % 9 and (T.Mul(79, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(80, 7) + threadIdx_x // 7 < 576: + if T.Mul(80, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(80, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(80, 7) + threadIdx_x // 7) % 9 and 
(T.Mul(80, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(81, 7) + threadIdx_x // 7 < 576: + if T.Mul(81, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(81, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(81, 7) + threadIdx_x // 7) % 9 and (T.Mul(81, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(82, 7) + threadIdx_x // 7 < 576: + if T.Mul(82, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(82, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(82, 7) + threadIdx_x // 7) % 9 and (T.Mul(82, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(82, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + threadIdx_x_1 = T.env_thread("threadIdx.x") + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(0, 49) + threadIdx_x_1 < 1536: + blockIdx_x = T.int32() + kernel_shared[T.Mul(0, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(0, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(0, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + blockIdx_x = T.int32() + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(1, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(1, 49) + threadIdx_x_1] = kernel[blockIdx_x * 
36864 + (T.Mul(1, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(1, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(2, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(2, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(2, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(2, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(3, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(3, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(3, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(3, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(4, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(4, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(4, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(4, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(5, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(5, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(5, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(5, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(6, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(6, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(6, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(6, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(7, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(7, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(7, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(7, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(8, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(8, 49) + 
threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(8, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(8, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(9, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(9, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(9, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(9, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(10, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(10, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(10, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(10, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(11, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(11, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(11, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(11, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(12, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(12, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(12, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(12, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(13, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(13, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(13, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(13, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(14, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(14, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(14, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(14, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(15, 49) + 
threadIdx_x_1 < 1536: + kernel_shared[T.Mul(15, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(15, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(15, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(16, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(16, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(16, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(16, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(17, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(17, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(17, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(17, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(18, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(18, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(18, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(18, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(19, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(19, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(19, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(19, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(20, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(20, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(20, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(20, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(21, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(21, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(21, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(21, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + 
with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(22, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(22, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(22, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(22, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(23, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(23, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(23, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(23, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(24, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(24, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(24, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(24, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(25, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(25, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(25, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(25, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(26, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(26, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(26, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(26, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(27, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(27, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(27, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(27, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(28, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(28, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(28, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + 
(T.Mul(28, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(29, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(29, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(29, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(29, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(30, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(30, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(30, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(30, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(31, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(31, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(31, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(31, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + for rc_outer_inner in range(8): + conv2d_nchw = T.Buffer((8,), scope="local", align=32) + threadIdx_x_2 = T.int32() + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + 
rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + 
conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = 
conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + 
T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * 
kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + 
rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + 
conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * 
kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + 
conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + 
rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * 
kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + 
conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * 
kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + 
conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + 
rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * 
kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + +0 + +8 + +i1_inner + +conv2d_nchw[i1_inner] + +8 + +blockIdx_x * 8 + +blockIdx_x * 8 + i1_inner + +bias[blockIdx_x * 8 + i1_inner] + +conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner] + +T.float32(0.0) + +T.max(conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) + +392 + +blockIdx_x * 392 + +49 + +i1_inner * 49 + +blockIdx_x * 392 + i1_inner * 49 + +blockIdx_x * 392 + i1_inner * 49 + threadIdx_x + +compute = T.Buffer((25088,)) +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +i1_inner = T.int32() +bias = T.Buffer((512,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + 
i1_inner], T.float32(0.0)) + +for i1_inner in range(8): + compute = T.Buffer((25088,)) + conv2d_nchw = T.Buffer((8,), scope="local", align=32) + bias = T.Buffer((512,)) + blockIdx_x = T.int32() + threadIdx_x = T.int32() + compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = T.float32(0.0) +conv2d_nchw[T.Mul(0, 4) + 1] = T.float32(0.0) +conv2d_nchw[T.Mul(0, 4) + 2] = T.float32(0.0) +conv2d_nchw[T.Mul(0, 4) + 3] = T.float32(0.0) +conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = T.float32(0.0) +conv2d_nchw[T.Mul(1, 4) + 1] = T.float32(0.0) +conv2d_nchw[T.Mul(1, 4) + 2] = T.float32(0.0) +conv2d_nchw[T.Mul(1, 4) + 3] = T.float32(0.0) +blockIdx_x = T.int32() +threadIdx_x_2 = T.int32() +for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + threadIdx_x = T.env_thread("threadIdx.x") + pad_temp_shared = T.Buffer((4032,), scope="shared") + data = T.Buffer((25088,)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(0, 7) + threadIdx_x // 7 < 576: + if T.Mul(0, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(0, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(0, 7) + threadIdx_x // 7) % 9 and (T.Mul(0, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(1, 7) + threadIdx_x // 7 < 576: + if T.Mul(1, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(1, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(1, 7) + threadIdx_x // 7) % 9 and (T.Mul(1, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(1, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(2, 7) + threadIdx_x // 7 < 576: + if T.Mul(2, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(2, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(2, 7) + threadIdx_x // 7) % 9 and (T.Mul(2, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(3, 7) + threadIdx_x // 7 < 576: + if T.Mul(3, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(3, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(3, 7) + threadIdx_x // 7) % 9 and (T.Mul(3, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(4, 7) + threadIdx_x // 7 < 576: + if T.Mul(4, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(4, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(4, 7) + threadIdx_x // 7) % 9 and (T.Mul(4, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(5, 7) + threadIdx_x // 7 < 576: + if T.Mul(5, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(5, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(5, 7) + threadIdx_x // 7) % 9 and (T.Mul(5, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x 
% 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(6, 7) + threadIdx_x // 7 < 576: + if T.Mul(6, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(6, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(6, 7) + threadIdx_x // 7) % 9 and (T.Mul(6, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(6, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(7, 7) + threadIdx_x // 7 < 576: + if T.Mul(7, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(7, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(7, 7) + threadIdx_x // 7) % 9 and (T.Mul(7, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(8, 7) + threadIdx_x // 7 < 576: + if T.Mul(8, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(8, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(8, 7) + threadIdx_x // 7) % 9 and (T.Mul(8, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(9, 7) + threadIdx_x // 7 < 576: + if T.Mul(9, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(9, 49) + threadIdx_x] = T.if_then_else(1 <= 
(T.Mul(9, 7) + threadIdx_x // 7) % 9 and (T.Mul(9, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(10, 7) + threadIdx_x // 7 < 576: + if T.Mul(10, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(10, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(10, 7) + threadIdx_x // 7) % 9 and (T.Mul(10, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(11, 7) + threadIdx_x // 7 < 576: + if T.Mul(11, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(11, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(11, 7) + threadIdx_x // 7) % 9 and (T.Mul(11, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(12, 7) + threadIdx_x // 7 < 576: + if T.Mul(12, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(12, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(12, 7) + threadIdx_x // 7) % 9 and (T.Mul(12, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if 
T.Mul(13, 7) + threadIdx_x // 7 < 576: + if T.Mul(13, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(13, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(13, 7) + threadIdx_x // 7) % 9 and (T.Mul(13, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(13, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(14, 7) + threadIdx_x // 7 < 576: + if T.Mul(14, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(14, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(14, 7) + threadIdx_x // 7) % 9 and (T.Mul(14, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(15, 7) + threadIdx_x // 7 < 576: + if T.Mul(15, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(15, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(15, 7) + threadIdx_x // 7) % 9 and (T.Mul(15, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(16, 7) + threadIdx_x // 7 < 576: + if T.Mul(16, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(16, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(16, 7) + threadIdx_x // 7) % 9 and (T.Mul(16, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x // 7) // 9 * 
49 + (T.Mul(16, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(17, 7) + threadIdx_x // 7 < 576: + if T.Mul(17, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(17, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(17, 7) + threadIdx_x // 7) % 9 and (T.Mul(17, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(18, 7) + threadIdx_x // 7 < 576: + if T.Mul(18, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(18, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(18, 7) + threadIdx_x // 7) % 9 and (T.Mul(18, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(19, 7) + threadIdx_x // 7 < 576: + if T.Mul(19, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(19, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(19, 7) + threadIdx_x // 7) % 9 and (T.Mul(19, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(20, 7) + threadIdx_x // 7 < 576: + if T.Mul(20, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(20, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(20, 7) + threadIdx_x // 7) % 9 and (T.Mul(20, 7) + threadIdx_x // 7) % 9 < 8 and 1 
<= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(21, 7) + threadIdx_x // 7 < 576: + if T.Mul(21, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(21, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(21, 7) + threadIdx_x // 7) % 9 and (T.Mul(21, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(22, 7) + threadIdx_x // 7 < 576: + if T.Mul(22, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(22, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(22, 7) + threadIdx_x // 7) % 9 and (T.Mul(22, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(23, 7) + threadIdx_x // 7 < 576: + if T.Mul(23, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(23, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(23, 7) + threadIdx_x // 7) % 9 and (T.Mul(23, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(24, 7) + threadIdx_x // 7 < 576: + if T.Mul(24, 49) + threadIdx_x < 4032: + 
pad_temp_shared[T.Mul(24, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(24, 7) + threadIdx_x // 7) % 9 and (T.Mul(24, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(25, 7) + threadIdx_x // 7 < 576: + if T.Mul(25, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(25, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(25, 7) + threadIdx_x // 7) % 9 and (T.Mul(25, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(25, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(26, 7) + threadIdx_x // 7 < 576: + if T.Mul(26, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(26, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(26, 7) + threadIdx_x // 7) % 9 and (T.Mul(26, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(27, 7) + threadIdx_x // 7 < 576: + if T.Mul(27, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(27, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(27, 7) + threadIdx_x // 7) % 9 and (T.Mul(27, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 
8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(28, 7) + threadIdx_x // 7 < 576: + if T.Mul(28, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(28, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(28, 7) + threadIdx_x // 7) % 9 and (T.Mul(28, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(29, 7) + threadIdx_x // 7 < 576: + if T.Mul(29, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(29, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(29, 7) + threadIdx_x // 7) % 9 and (T.Mul(29, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(30, 7) + threadIdx_x // 7 < 576: + if T.Mul(30, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(30, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(30, 7) + threadIdx_x // 7) % 9 and (T.Mul(30, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(31, 7) + threadIdx_x // 7 < 576: + if T.Mul(31, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(31, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(31, 7) + threadIdx_x // 7) % 9 and (T.Mul(31, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, 
data[rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(31, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(32, 7) + threadIdx_x // 7 < 576: + if T.Mul(32, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(32, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(32, 7) + threadIdx_x // 7) % 9 and (T.Mul(32, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(33, 7) + threadIdx_x // 7 < 576: + if T.Mul(33, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(33, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(33, 7) + threadIdx_x // 7) % 9 and (T.Mul(33, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(34, 7) + threadIdx_x // 7 < 576: + if T.Mul(34, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(34, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(34, 7) + threadIdx_x // 7) % 9 and (T.Mul(34, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(35, 7) + threadIdx_x // 7 < 576: + if T.Mul(35, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(35, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(35, 7) + 
threadIdx_x // 7) % 9 and (T.Mul(35, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(36, 7) + threadIdx_x // 7 < 576: + if T.Mul(36, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(36, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(36, 7) + threadIdx_x // 7) % 9 and (T.Mul(36, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(37, 7) + threadIdx_x // 7 < 576: + if T.Mul(37, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(37, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(37, 7) + threadIdx_x // 7) % 9 and (T.Mul(37, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(37, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(38, 7) + threadIdx_x // 7 < 576: + if T.Mul(38, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(38, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(38, 7) + threadIdx_x // 7) % 9 and (T.Mul(38, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(39, 7) + 
threadIdx_x // 7 < 576: + if T.Mul(39, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(39, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(39, 7) + threadIdx_x // 7) % 9 and (T.Mul(39, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(40, 7) + threadIdx_x // 7 < 576: + if T.Mul(40, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(40, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(40, 7) + threadIdx_x // 7) % 9 and (T.Mul(40, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(41, 7) + threadIdx_x // 7 < 576: + if T.Mul(41, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(41, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(41, 7) + threadIdx_x // 7) % 9 and (T.Mul(41, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(42, 7) + threadIdx_x // 7 < 576: + if T.Mul(42, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(42, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(42, 7) + threadIdx_x // 7) % 9 and (T.Mul(42, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(42, 
7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(43, 7) + threadIdx_x // 7 < 576: + if T.Mul(43, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(43, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(43, 7) + threadIdx_x // 7) % 9 and (T.Mul(43, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(44, 7) + threadIdx_x // 7 < 576: + if T.Mul(44, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(44, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(44, 7) + threadIdx_x // 7) % 9 and (T.Mul(44, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(45, 7) + threadIdx_x // 7 < 576: + if T.Mul(45, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(45, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(45, 7) + threadIdx_x // 7) % 9 and (T.Mul(45, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(46, 7) + threadIdx_x // 7 < 576: + if T.Mul(46, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(46, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(46, 7) + threadIdx_x // 7) % 9 and (T.Mul(46, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= 
rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(47, 7) + threadIdx_x // 7 < 576: + if T.Mul(47, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(47, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(47, 7) + threadIdx_x // 7) % 9 and (T.Mul(47, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(48, 7) + threadIdx_x // 7 < 576: + if T.Mul(48, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(48, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(48, 7) + threadIdx_x // 7) % 9 and (T.Mul(48, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(49, 7) + threadIdx_x // 7 < 576: + if T.Mul(49, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(49, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(49, 7) + threadIdx_x // 7) % 9 and (T.Mul(49, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(50, 7) + threadIdx_x // 7 < 576: + if T.Mul(50, 49) + threadIdx_x < 4032: + 
pad_temp_shared[T.Mul(50, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(50, 7) + threadIdx_x // 7) % 9 and (T.Mul(50, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(51, 7) + threadIdx_x // 7 < 576: + if T.Mul(51, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(51, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(51, 7) + threadIdx_x // 7) % 9 and (T.Mul(51, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(51, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(52, 7) + threadIdx_x // 7 < 576: + if T.Mul(52, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(52, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(52, 7) + threadIdx_x // 7) % 9 and (T.Mul(52, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(53, 7) + threadIdx_x // 7 < 576: + if T.Mul(53, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(53, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(53, 7) + threadIdx_x // 7) % 9 and (T.Mul(53, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 
8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(54, 7) + threadIdx_x // 7 < 576: + if T.Mul(54, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(54, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(54, 7) + threadIdx_x // 7) % 9 and (T.Mul(54, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(55, 7) + threadIdx_x // 7 < 576: + if T.Mul(55, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(55, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(55, 7) + threadIdx_x // 7) % 9 and (T.Mul(55, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(56, 7) + threadIdx_x // 7 < 576: + if T.Mul(56, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(56, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(56, 7) + threadIdx_x // 7) % 9 and (T.Mul(56, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(57, 7) + threadIdx_x // 7 < 576: + if T.Mul(57, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(57, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(57, 7) + threadIdx_x // 7) % 9 and (T.Mul(57, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, 
data[rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(58, 7) + threadIdx_x // 7 < 576: + if T.Mul(58, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(58, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(58, 7) + threadIdx_x // 7) % 9 and (T.Mul(58, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(59, 7) + threadIdx_x // 7 < 576: + if T.Mul(59, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(59, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(59, 7) + threadIdx_x // 7) % 9 and (T.Mul(59, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(60, 7) + threadIdx_x // 7 < 576: + if T.Mul(60, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(60, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(60, 7) + threadIdx_x // 7) % 9 and (T.Mul(60, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(61, 7) + threadIdx_x // 7 < 576: + if T.Mul(61, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(61, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(61, 7) + 
threadIdx_x // 7) % 9 and (T.Mul(61, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(62, 7) + threadIdx_x // 7 < 576: + if T.Mul(62, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(62, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(62, 7) + threadIdx_x // 7) % 9 and (T.Mul(62, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(63, 7) + threadIdx_x // 7 < 576: + if T.Mul(63, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(63, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(63, 7) + threadIdx_x // 7) % 9 and (T.Mul(63, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(64, 7) + threadIdx_x // 7 < 576: + if T.Mul(64, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(64, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(64, 7) + threadIdx_x // 7) % 9 and (T.Mul(64, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(65, 7) + 
threadIdx_x // 7 < 576: + if T.Mul(65, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(65, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(65, 7) + threadIdx_x // 7) % 9 and (T.Mul(65, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(66, 7) + threadIdx_x // 7 < 576: + if T.Mul(66, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(66, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(66, 7) + threadIdx_x // 7) % 9 and (T.Mul(66, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(67, 7) + threadIdx_x // 7 < 576: + if T.Mul(67, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(67, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(67, 7) + threadIdx_x // 7) % 9 and (T.Mul(67, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(68, 7) + threadIdx_x // 7 < 576: + if T.Mul(68, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(68, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(68, 7) + threadIdx_x // 7) % 9 and (T.Mul(68, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(68, 
7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(69, 7) + threadIdx_x // 7 < 576: + if T.Mul(69, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(69, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(69, 7) + threadIdx_x // 7) % 9 and (T.Mul(69, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(70, 7) + threadIdx_x // 7 < 576: + if T.Mul(70, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(70, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(70, 7) + threadIdx_x // 7) % 9 and (T.Mul(70, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(71, 7) + threadIdx_x // 7 < 576: + if T.Mul(71, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(71, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(71, 7) + threadIdx_x // 7) % 9 and (T.Mul(71, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(72, 7) + threadIdx_x // 7 < 576: + if T.Mul(72, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(72, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(72, 7) + threadIdx_x // 7) % 9 and (T.Mul(72, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= 
rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(73, 7) + threadIdx_x // 7 < 576: + if T.Mul(73, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(73, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(73, 7) + threadIdx_x // 7) % 9 and (T.Mul(73, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(74, 7) + threadIdx_x // 7 < 576: + if T.Mul(74, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(74, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(74, 7) + threadIdx_x // 7) % 9 and (T.Mul(74, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(75, 7) + threadIdx_x // 7 < 576: + if T.Mul(75, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(75, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(75, 7) + threadIdx_x // 7) % 9 and (T.Mul(75, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(76, 7) + threadIdx_x // 7 < 576: + if T.Mul(76, 49) + threadIdx_x < 4032: + 
pad_temp_shared[T.Mul(76, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(76, 7) + threadIdx_x // 7) % 9 and (T.Mul(76, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(77, 7) + threadIdx_x // 7 < 576: + if T.Mul(77, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(77, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(77, 7) + threadIdx_x // 7) % 9 and (T.Mul(77, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(77, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(78, 7) + threadIdx_x // 7 < 576: + if T.Mul(78, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(78, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(78, 7) + threadIdx_x // 7) % 9 and (T.Mul(78, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(79, 7) + threadIdx_x // 7 < 576: + if T.Mul(79, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(79, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(79, 7) + threadIdx_x // 7) % 9 and (T.Mul(79, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 
8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(80, 7) + threadIdx_x // 7 < 576: + if T.Mul(80, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(80, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(80, 7) + threadIdx_x // 7) % 9 and (T.Mul(80, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(81, 7) + threadIdx_x // 7 < 576: + if T.Mul(81, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(81, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(81, 7) + threadIdx_x // 7) % 9 and (T.Mul(81, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if T.Mul(82, 7) + threadIdx_x // 7 < 576: + if T.Mul(82, 49) + threadIdx_x < 4032: + pad_temp_shared[T.Mul(82, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(82, 7) + threadIdx_x // 7) % 9 and (T.Mul(82, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(82, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + threadIdx_x_1 = T.env_thread("threadIdx.x") + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(0, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(0, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(0, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + 
(T.Mul(0, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(1, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(1, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(1, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(1, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(2, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(2, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(2, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(2, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(3, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(3, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(3, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(3, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(4, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(4, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(4, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(4, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(5, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(5, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(5, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(5, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(6, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(6, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(6, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(6, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(7, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(7, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(7, 49) + threadIdx_x_1) // 192 * 
4608 + rc_outer_outer * 576 + (T.Mul(7, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(8, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(8, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(8, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(8, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(9, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(9, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(9, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(9, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(10, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(10, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(10, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(10, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(11, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(11, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(11, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(11, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(12, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(12, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(12, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(12, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(13, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(13, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(13, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(13, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(14, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(14, 49) + threadIdx_x_1] = kernel[blockIdx_x * 
36864 + (T.Mul(14, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(14, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(15, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(15, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(15, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(15, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(16, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(16, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(16, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(16, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(17, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(17, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(17, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(17, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(18, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(18, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(18, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(18, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(19, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(19, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(19, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(19, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(20, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(20, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(20, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(20, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(21, 49) + threadIdx_x_1 < 1536: + 
kernel_shared[T.Mul(21, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(21, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(21, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(22, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(22, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(22, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(22, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(23, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(23, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(23, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(23, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(24, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(24, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(24, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(24, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(25, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(25, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(25, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(25, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(26, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(26, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(26, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(26, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(27, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(27, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(27, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(27, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with 
T.launch_thread(threadIdx_x_1, 49): + if T.Mul(28, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(28, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(28, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(28, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(29, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(29, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(29, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(29, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(30, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(30, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(30, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(30, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(31, 49) + threadIdx_x_1 < 1536: + kernel_shared[T.Mul(31, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(31, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(31, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] + for rc_outer_inner in range(8): + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + 
T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * 
kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + 
rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + 
conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = 
conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + 
T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + 
rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * 
kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + 
conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + 
rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * 
kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + 
conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * 
kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + 
conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + 
rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] +for i1_inner in range(8): + compute = T.Buffer((25088,)) + bias = T.Buffer((512,)) + compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x_2] = T.max(conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + conv2d_nchw = T.Buffer((8,), scope="local", align=32) + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = T.float32(0.0) + conv2d_nchw[T.Mul(0, 4) + 1] = T.float32(0.0) + conv2d_nchw[T.Mul(0, 4) + 2] = T.float32(0.0) + conv2d_nchw[T.Mul(0, 4) + 3] = T.float32(0.0) + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = T.float32(0.0) + conv2d_nchw[T.Mul(1, 4) + 1] = T.float32(0.0) + 
conv2d_nchw[T.Mul(1, 4) + 2] = T.float32(0.0) + conv2d_nchw[T.Mul(1, 4) + 3] = T.float32(0.0) + blockIdx_x = T.int32() + for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + threadIdx_x_1 = T.env_thread("threadIdx.x") + pad_temp_shared = T.Buffer((4032,), scope="shared") + data = T.Buffer((25088,)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(0, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(0, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(0, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(0, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(0, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(1, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(1, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(1, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(1, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(1, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(2, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(2, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(2, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(2, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(2, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): 
+ if T.Mul(3, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(3, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(3, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(3, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(3, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(4, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(4, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(4, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(4, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(4, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(5, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(5, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(5, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(5, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(5, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(6, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(6, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(6, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(6, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(6, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, 
data[rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(6, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(7, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(7, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(7, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(7, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(7, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(8, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(8, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(8, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(8, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(8, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(9, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(9, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(9, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(9, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(9, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(10, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(10, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(10, 49) + 
threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(10, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(10, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(11, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(11, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(11, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(11, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(11, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(12, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(12, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(12, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(12, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(12, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(13, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(13, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(13, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(13, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(13, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(13, 7) + threadIdx_x_1 // 
7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(14, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(14, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(14, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(14, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(14, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(15, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(15, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(15, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(15, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(15, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(16, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(16, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(16, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(16, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(16, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(16, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(17, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(17, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(17, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(17, 7) + threadIdx_x_1 // 7) % 9 and 
(T.Mul(17, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(18, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(18, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(18, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(18, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(18, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(19, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(19, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(19, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(19, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(19, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(20, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(20, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(20, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(20, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(20, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with 
T.launch_thread(threadIdx_x_1, 49): + if T.Mul(21, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(21, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(21, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(21, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(21, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(22, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(22, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(22, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(22, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(22, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(23, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(23, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(23, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(23, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(23, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(24, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(24, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(24, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(24, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(24, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + 
threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(25, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(25, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(25, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(25, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(25, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(25, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(26, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(26, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(26, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(26, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(26, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(27, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(27, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(27, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(27, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(27, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(28, 7) + threadIdx_x_1 // 
7 < 576: + if T.Mul(28, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(28, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(28, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(28, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(29, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(29, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(29, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(29, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(29, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(30, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(30, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(30, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(30, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(30, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(31, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(31, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(31, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(31, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(31, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 
3136 + (T.Mul(31, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(31, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(32, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(32, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(32, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(32, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(32, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(33, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(33, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(33, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(33, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(33, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(34, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(34, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(34, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(34, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(34, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(35, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(35, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(35, 49) 
+ threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(35, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(35, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(36, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(36, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(36, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(36, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(36, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(37, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(37, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(37, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(37, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(37, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(37, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(38, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(38, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(38, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(38, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(38, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x_1 
// 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(39, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(39, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(39, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(39, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(39, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(40, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(40, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(40, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(40, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(40, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(41, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(41, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(41, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(41, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(41, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(42, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(42, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(42, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(42, 7) + threadIdx_x_1 // 7) % 9 and 
(T.Mul(42, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(43, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(43, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(43, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(43, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(43, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(44, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(44, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(44, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(44, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(44, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(45, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(45, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(45, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(45, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(45, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with 
T.launch_thread(threadIdx_x_1, 49): + if T.Mul(46, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(46, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(46, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(46, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(46, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(47, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(47, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(47, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(47, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(47, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(48, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(48, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(48, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(48, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(48, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(49, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(49, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(49, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(49, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(49, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + 
threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(50, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(50, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(50, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(50, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(50, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(51, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(51, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(51, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(51, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(51, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(51, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(52, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(52, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(52, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(52, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(52, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(53, 7) + threadIdx_x_1 // 
7 < 576: + if T.Mul(53, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(53, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(53, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(53, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(54, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(54, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(54, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(54, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(54, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(55, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(55, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(55, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(55, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(55, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(56, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(56, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(56, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(56, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(56, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 
3136 + (T.Mul(56, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(57, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(57, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(57, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(57, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(57, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(58, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(58, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(58, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(58, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(58, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(59, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(59, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(59, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(59, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(59, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(60, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(60, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(60, 49) 
+ threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(60, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(60, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(61, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(61, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(61, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(61, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(61, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(62, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(62, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(62, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(62, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(62, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(63, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(63, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(63, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(63, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(63, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x_1 
// 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(64, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(64, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(64, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(64, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(64, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(65, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(65, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(65, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(65, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(65, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(66, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(66, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(66, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(66, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(66, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(67, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(67, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(67, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(67, 7) + threadIdx_x_1 // 7) % 9 and 
(T.Mul(67, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(68, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(68, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(68, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(68, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(68, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(69, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(69, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(69, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(69, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(69, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(70, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(70, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(70, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(70, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(70, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with 
T.launch_thread(threadIdx_x_1, 49): + if T.Mul(71, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(71, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(71, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(71, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(71, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(72, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(72, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(72, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(72, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(72, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(73, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(73, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(73, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(73, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(73, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(74, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(74, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(74, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(74, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(74, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + 
threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(75, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(75, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(75, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(75, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(75, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(76, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(76, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(76, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(76, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(76, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(77, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(77, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(77, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(77, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(77, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(77, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(78, 7) + threadIdx_x_1 // 
7 < 576: + if T.Mul(78, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(78, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(78, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(78, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(79, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(79, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(79, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(79, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(79, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(80, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(80, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(80, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(80, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(80, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(81, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(81, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(81, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(81, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(81, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 
3136 + (T.Mul(81, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(82, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(82, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(82, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(82, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(82, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(82, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + threadIdx_x_2 = T.env_thread("threadIdx.x") + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(0, 49) + threadIdx_x_2 < 1536: + kernel_shared[T.Mul(0, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(0, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(0, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(1, 49) + threadIdx_x_2 < 1536: + kernel_shared[T.Mul(1, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(1, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(1, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(2, 49) + threadIdx_x_2 < 1536: + kernel_shared[T.Mul(2, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(2, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(2, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(3, 49) + threadIdx_x_2 < 1536: + kernel_shared[T.Mul(3, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(3, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(3, 49) + threadIdx_x_2) % 192 * 3 + 
rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(4, 49) + threadIdx_x_2 < 1536: + kernel_shared[T.Mul(4, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(4, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(4, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(5, 49) + threadIdx_x_2 < 1536: + kernel_shared[T.Mul(5, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(5, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(5, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(6, 49) + threadIdx_x_2 < 1536: + kernel_shared[T.Mul(6, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(6, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(6, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(7, 49) + threadIdx_x_2 < 1536: + kernel_shared[T.Mul(7, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(7, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(7, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(8, 49) + threadIdx_x_2 < 1536: + kernel_shared[T.Mul(8, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(8, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(8, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(9, 49) + threadIdx_x_2 < 1536: + kernel_shared[T.Mul(9, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(9, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(9, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(10, 49) + threadIdx_x_2 < 1536: + kernel_shared[T.Mul(10, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(10, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + 
(T.Mul(10, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(11, 49) + threadIdx_x_2 < 1536: + kernel_shared[T.Mul(11, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(11, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(11, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(12, 49) + threadIdx_x_2 < 1536: + kernel_shared[T.Mul(12, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(12, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(12, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(13, 49) + threadIdx_x_2 < 1536: + kernel_shared[T.Mul(13, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(13, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(13, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(14, 49) + threadIdx_x_2 < 1536: + kernel_shared[T.Mul(14, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(14, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(14, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(15, 49) + threadIdx_x_2 < 1536: + kernel_shared[T.Mul(15, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(15, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(15, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(16, 49) + threadIdx_x_2 < 1536: + kernel_shared[T.Mul(16, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(16, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(16, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(17, 49) + threadIdx_x_2 < 1536: + kernel_shared[T.Mul(17, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(17, 
49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(17, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(18, 49) + threadIdx_x_2 < 1536: + kernel_shared[T.Mul(18, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(18, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(18, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(19, 49) + threadIdx_x_2 < 1536: + kernel_shared[T.Mul(19, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(19, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(19, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(20, 49) + threadIdx_x_2 < 1536: + kernel_shared[T.Mul(20, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(20, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(20, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(21, 49) + threadIdx_x_2 < 1536: + kernel_shared[T.Mul(21, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(21, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(21, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(22, 49) + threadIdx_x_2 < 1536: + kernel_shared[T.Mul(22, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(22, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(22, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(23, 49) + threadIdx_x_2 < 1536: + kernel_shared[T.Mul(23, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(23, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(23, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(24, 49) + threadIdx_x_2 < 1536: + kernel_shared[T.Mul(24, 49) 
+ threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(24, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(24, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(25, 49) + threadIdx_x_2 < 1536: + kernel_shared[T.Mul(25, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(25, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(25, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(26, 49) + threadIdx_x_2 < 1536: + kernel_shared[T.Mul(26, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(26, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(26, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(27, 49) + threadIdx_x_2 < 1536: + kernel_shared[T.Mul(27, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(27, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(27, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(28, 49) + threadIdx_x_2 < 1536: + kernel_shared[T.Mul(28, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(28, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(28, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(29, 49) + threadIdx_x_2 < 1536: + kernel_shared[T.Mul(29, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(29, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(29, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(30, 49) + threadIdx_x_2 < 1536: + kernel_shared[T.Mul(30, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(30, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(30, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(31, 
49) + threadIdx_x_2 < 1536: + kernel_shared[T.Mul(31, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(31, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(31, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + for rc_outer_inner in range(8): + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = 
conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * 
kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 
+ T.Mul(7, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = 
conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 
504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * 
kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + 
T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + 
T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + 
T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + 
T.Mul(4, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + 
T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + 
T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + 
T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + 
T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + for i1_inner in range(8): + compute = T.Buffer((25088,)) + 
bias = T.Buffer((512,)) + compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) + +T.bool(True) + +with T.allocate([1536], "float32", "shared") as kernel_shared: + threadIdx_x = T.launch_thread("threadIdx.x", 49) + conv2d_nchw = T.Buffer((8,), scope="local", align=32) + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = T.float32(0.0) + conv2d_nchw[T.Mul(0, 4) + 1] = T.float32(0.0) + conv2d_nchw[T.Mul(0, 4) + 2] = T.float32(0.0) + conv2d_nchw[T.Mul(0, 4) + 3] = T.float32(0.0) + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = T.float32(0.0) + conv2d_nchw[T.Mul(1, 4) + 1] = T.float32(0.0) + conv2d_nchw[T.Mul(1, 4) + 2] = T.float32(0.0) + conv2d_nchw[T.Mul(1, 4) + 3] = T.float32(0.0) + blockIdx_x = T.int32() + for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + threadIdx_x_1 = T.env_thread("threadIdx.x") + pad_temp_shared = T.Buffer((4032,), scope="shared") + data = T.Buffer((25088,)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(0, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(0, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(0, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(0, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(0, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(1, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(1, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(1, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(1, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(1, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x_1 // 7) % 9 * 7 
+ rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(2, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(2, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(2, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(2, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(2, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(3, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(3, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(3, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(3, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(3, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(4, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(4, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(4, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(4, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(4, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(5, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(5, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(5, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(5, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(5, 7) + threadIdx_x_1 // 7) % 9 < 
8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(6, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(6, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(6, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(6, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(6, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(6, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(7, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(7, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(7, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(7, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(7, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(8, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(8, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(8, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(8, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(8, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(9, 7) + threadIdx_x_1 
// 7 < 576: + if T.Mul(9, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(9, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(9, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(9, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(10, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(10, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(10, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(10, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(10, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(11, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(11, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(11, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(11, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(11, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(12, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(12, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(12, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(12, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(12, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 
+ (T.Mul(12, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(13, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(13, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(13, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(13, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(13, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(13, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(14, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(14, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(14, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(14, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(14, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(15, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(15, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(15, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(15, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(15, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(16, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(16, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(16, 49) + 
threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(16, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(16, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(16, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(17, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(17, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(17, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(17, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(17, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(18, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(18, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(18, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(18, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(18, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(19, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(19, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(19, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(19, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(19, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x_1 // 
7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(20, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(20, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(20, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(20, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(20, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(21, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(21, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(21, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(21, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(21, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(22, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(22, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(22, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(22, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(22, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(23, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(23, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(23, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(23, 7) + threadIdx_x_1 // 7) % 9 and 
(T.Mul(23, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(24, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(24, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(24, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(24, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(24, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(25, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(25, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(25, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(25, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(25, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(25, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(26, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(26, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(26, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(26, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(26, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with 
T.launch_thread(threadIdx_x_1, 49): + if T.Mul(27, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(27, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(27, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(27, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(27, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(28, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(28, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(28, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(28, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(28, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(29, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(29, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(29, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(29, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(29, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(30, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(30, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(30, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(30, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(30, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + 
threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(31, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(31, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(31, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(31, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(31, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(31, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(32, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(32, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(32, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(32, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(32, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(33, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(33, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(33, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(33, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(33, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(34, 7) + threadIdx_x_1 // 
7 < 576: + if T.Mul(34, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(34, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(34, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(34, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(35, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(35, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(35, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(35, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(35, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(36, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(36, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(36, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(36, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(36, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(37, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(37, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(37, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(37, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(37, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 
3136 + (T.Mul(37, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(38, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(38, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(38, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(38, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(38, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(39, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(39, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(39, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(39, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(39, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(40, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(40, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(40, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(40, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(40, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(41, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(41, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(41, 49) 
+ threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(41, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(41, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(42, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(42, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(42, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(42, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(42, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(43, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(43, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(43, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(43, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(43, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(44, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(44, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(44, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(44, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(44, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x_1 
// 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(45, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(45, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(45, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(45, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(45, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(46, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(46, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(46, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(46, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(46, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(47, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(47, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(47, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(47, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(47, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(48, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(48, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(48, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(48, 7) + threadIdx_x_1 // 7) % 9 and 
(T.Mul(48, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(49, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(49, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(49, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(49, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(49, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(50, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(50, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(50, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(50, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(50, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(51, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(51, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(51, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(51, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(51, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(51, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with 
T.launch_thread(threadIdx_x_1, 49): + if T.Mul(52, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(52, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(52, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(52, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(52, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(53, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(53, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(53, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(53, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(53, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(54, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(54, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(54, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(54, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(54, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(55, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(55, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(55, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(55, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(55, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + 
threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(56, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(56, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(56, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(56, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(56, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(57, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(57, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(57, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(57, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(57, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(58, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(58, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(58, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(58, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(58, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(59, 7) + threadIdx_x_1 // 
7 < 576: + if T.Mul(59, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(59, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(59, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(59, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(60, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(60, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(60, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(60, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(60, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(61, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(61, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(61, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(61, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(61, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(62, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(62, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(62, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(62, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(62, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 
3136 + (T.Mul(62, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(63, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(63, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(63, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(63, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(63, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(64, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(64, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(64, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(64, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(64, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(65, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(65, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(65, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(65, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(65, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(66, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(66, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(66, 49) 
+ threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(66, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(66, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(67, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(67, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(67, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(67, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(67, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(68, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(68, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(68, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(68, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(68, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(69, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(69, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(69, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(69, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(69, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x_1 
// 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(70, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(70, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(70, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(70, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(70, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(71, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(71, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(71, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(71, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(71, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(72, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(72, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(72, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(72, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(72, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(73, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(73, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(73, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(73, 7) + threadIdx_x_1 // 7) % 9 and 
(T.Mul(73, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(74, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(74, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(74, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(74, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(74, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(75, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(75, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(75, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(75, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(75, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(76, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(76, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(76, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(76, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(76, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with 
T.launch_thread(threadIdx_x_1, 49): + if T.Mul(77, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(77, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(77, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(77, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(77, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(77, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(78, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(78, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(78, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(78, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(78, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(79, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(79, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(79, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(79, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(79, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(80, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(80, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(80, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(80, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(80, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + 
threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(81, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(81, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(81, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(81, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(81, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(82, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(82, 49) + threadIdx_x_1 < 4032: + pad_temp_shared[T.Mul(82, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(82, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(82, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(82, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + threadIdx_x_2 = T.env_thread("threadIdx.x") + kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") + kernel = T.Buffer((2359296,)) + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(0, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(0, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(0, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(0, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(1, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(1, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(1, 49) + threadIdx_x_2) 
// 192 * 4608 + rc_outer_outer * 576 + (T.Mul(1, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(2, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(2, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(2, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(2, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(3, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(3, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(3, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(3, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(4, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(4, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(4, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(4, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(5, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(5, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(5, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(5, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(6, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(6, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(6, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(6, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(7, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(7, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(7, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(7, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(8, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(8, 49) + threadIdx_x_2] = 
kernel[blockIdx_x * 36864 + (T.Mul(8, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(8, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(9, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(9, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(9, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(9, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(10, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(10, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(10, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(10, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(11, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(11, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(11, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(11, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(12, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(12, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(12, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(12, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(13, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(13, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(13, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(13, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(14, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(14, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(14, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(14, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(15, 49) + 
threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(15, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(15, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(15, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(16, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(16, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(16, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(16, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(17, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(17, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(17, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(17, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(18, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(18, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(18, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(18, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(19, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(19, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(19, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(19, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(20, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(20, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(20, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(20, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(21, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(21, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(21, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(21, 49) + threadIdx_x_2) % 192 * 3 + 
rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(22, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(22, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(22, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(22, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(23, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(23, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(23, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(23, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(24, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(24, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(24, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(24, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(25, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(25, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(25, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(25, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(26, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(26, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(26, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(26, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(27, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(27, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(27, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(27, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(28, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(28, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(28, 49) + threadIdx_x_2) // 192 * 
4608 + rc_outer_outer * 576 + (T.Mul(28, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(29, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(29, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(29, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(29, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(30, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(30, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(30, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(30, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(31, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(31, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(31, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(31, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + for rc_outer_inner in range(8): + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + 
conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = 
conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + 
T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * 
kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + 
rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + 
conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) 
+ 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + 
threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + 
conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + 
rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * 
kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + 
conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * 
kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + 
conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + 
rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * 
kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + 
conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + for i1_inner in range(8): + compute = T.Buffer((25088,)) + bias = T.Buffer((512,)) + compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) + +T.bool(True) + +with T.allocate([4032], "float32", "shared") as pad_temp_shared: + kernel_shared = T.allocate([1536], "float32", "shared") + threadIdx_x = T.launch_thread("threadIdx.x", 49) + conv2d_nchw = T.Buffer((8,), scope="local", align=32) + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = T.float32(0.0) + conv2d_nchw[T.Mul(0, 4) + 1] = T.float32(0.0) + conv2d_nchw[T.Mul(0, 4) + 2] = T.float32(0.0) + conv2d_nchw[T.Mul(0, 4) + 3] = T.float32(0.0) + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = T.float32(0.0) + conv2d_nchw[T.Mul(1, 4) + 1] = T.float32(0.0) + conv2d_nchw[T.Mul(1, 4) + 2] = 
T.float32(0.0) + conv2d_nchw[T.Mul(1, 4) + 3] = T.float32(0.0) + blockIdx_x = T.int32() + for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + threadIdx_x_1 = T.env_thread("threadIdx.x") + pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") + data = T.Buffer((25088,)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(0, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(0, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(0, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(0, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(0, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(1, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(1, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(1, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(1, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(1, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(2, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(2, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(2, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(2, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(2, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): 
+ if T.Mul(3, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(3, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(3, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(3, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(3, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(4, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(4, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(4, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(4, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(4, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(5, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(5, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(5, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(5, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(5, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(6, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(6, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(6, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(6, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(6, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, 
data[rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(6, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(7, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(7, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(7, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(7, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(7, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(8, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(8, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(8, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(8, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(8, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(9, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(9, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(9, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(9, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(9, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(10, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(10, 49) + threadIdx_x_1 < 4032: + 
pad_temp_shared_1[T.Mul(10, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(10, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(10, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(11, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(11, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(11, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(11, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(11, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(12, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(12, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(12, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(12, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(12, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(13, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(13, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(13, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(13, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(13, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x_1 // 7) // 9 
* 49 + (T.Mul(13, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(14, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(14, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(14, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(14, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(14, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(15, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(15, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(15, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(15, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(15, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(16, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(16, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(16, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(16, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(16, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(16, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(17, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(17, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(17, 49) + threadIdx_x_1] = T.if_then_else(1 <= 
(T.Mul(17, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(17, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(18, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(18, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(18, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(18, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(18, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(19, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(19, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(19, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(19, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(19, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(20, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(20, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(20, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(20, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(20, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + 
threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(21, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(21, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(21, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(21, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(21, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(22, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(22, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(22, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(22, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(22, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(23, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(23, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(23, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(23, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(23, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(24, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(24, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(24, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(24, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(24, 7) + 
threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(25, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(25, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(25, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(25, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(25, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(25, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(26, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(26, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(26, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(26, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(26, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(27, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(27, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(27, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(27, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(27, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with 
T.launch_thread(threadIdx_x_1, 49): + if T.Mul(28, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(28, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(28, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(28, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(28, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(29, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(29, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(29, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(29, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(29, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(30, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(30, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(30, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(30, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(30, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(31, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(31, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(31, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(31, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(31, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + 
threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(31, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(32, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(32, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(32, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(32, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(32, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(33, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(33, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(33, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(33, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(33, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(34, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(34, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(34, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(34, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(34, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(35, 7) + 
threadIdx_x_1 // 7 < 576: + if T.Mul(35, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(35, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(35, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(35, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(36, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(36, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(36, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(36, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(36, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(37, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(37, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(37, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(37, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(37, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(37, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(38, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(38, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(38, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(38, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(38, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 
8, data[rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(39, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(39, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(39, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(39, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(39, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(40, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(40, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(40, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(40, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(40, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(41, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(41, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(41, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(41, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(41, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(42, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(42, 49) + threadIdx_x_1 < 4032: + 
pad_temp_shared_1[T.Mul(42, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(42, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(42, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(43, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(43, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(43, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(43, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(43, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(44, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(44, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(44, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(44, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(44, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(45, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(45, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(45, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(45, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(45, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x_1 // 7) // 9 
* 49 + (T.Mul(45, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(46, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(46, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(46, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(46, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(46, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(47, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(47, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(47, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(47, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(47, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(48, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(48, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(48, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(48, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(48, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(49, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(49, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(49, 49) + threadIdx_x_1] = T.if_then_else(1 <= 
(T.Mul(49, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(49, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(50, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(50, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(50, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(50, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(50, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(51, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(51, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(51, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(51, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(51, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(51, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(52, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(52, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(52, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(52, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(52, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + 
threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(53, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(53, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(53, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(53, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(53, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(54, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(54, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(54, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(54, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(54, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(55, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(55, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(55, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(55, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(55, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(56, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(56, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(56, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(56, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(56, 7) + 
threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(57, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(57, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(57, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(57, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(57, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(58, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(58, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(58, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(58, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(58, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(59, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(59, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(59, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(59, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(59, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with 
T.launch_thread(threadIdx_x_1, 49): + if T.Mul(60, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(60, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(60, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(60, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(60, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(61, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(61, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(61, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(61, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(61, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(62, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(62, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(62, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(62, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(62, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(63, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(63, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(63, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(63, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(63, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + 
threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(64, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(64, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(64, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(64, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(64, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(65, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(65, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(65, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(65, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(65, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(66, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(66, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(66, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(66, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(66, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(67, 7) + 
threadIdx_x_1 // 7 < 576: + if T.Mul(67, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(67, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(67, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(67, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(68, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(68, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(68, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(68, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(68, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(69, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(69, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(69, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(69, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(69, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(70, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(70, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(70, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(70, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(70, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 
8, data[rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(71, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(71, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(71, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(71, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(71, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(72, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(72, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(72, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(72, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(72, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(73, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(73, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(73, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(73, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(73, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(74, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(74, 49) + threadIdx_x_1 < 4032: + 
pad_temp_shared_1[T.Mul(74, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(74, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(74, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(75, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(75, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(75, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(75, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(75, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(76, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(76, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(76, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(76, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(76, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(77, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(77, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(77, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(77, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(77, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x_1 // 7) // 9 
* 49 + (T.Mul(77, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(78, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(78, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(78, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(78, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(78, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(79, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(79, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(79, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(79, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(79, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(80, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(80, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(80, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(80, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(80, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(81, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(81, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(81, 49) + threadIdx_x_1] = T.if_then_else(1 <= 
(T.Mul(81, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(81, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(82, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(82, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(82, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(82, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(82, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(82, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + threadIdx_x_2 = T.env_thread("threadIdx.x") + kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") + kernel = T.Buffer((2359296,)) + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(0, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(0, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(0, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(0, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(1, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(1, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(1, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(1, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(2, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(2, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(2, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(2, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if 
T.Mul(3, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(3, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(3, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(3, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(4, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(4, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(4, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(4, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(5, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(5, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(5, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(5, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(6, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(6, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(6, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(6, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(7, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(7, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(7, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(7, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(8, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(8, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(8, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(8, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(9, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(9, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(9, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(9, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + 
with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(10, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(10, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(10, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(10, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(11, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(11, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(11, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(11, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(12, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(12, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(12, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(12, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(13, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(13, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(13, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(13, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(14, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(14, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(14, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(14, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(15, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(15, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(15, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(15, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(16, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(16, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(16, 49) + threadIdx_x_2) // 192 * 4608 + 
rc_outer_outer * 576 + (T.Mul(16, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(17, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(17, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(17, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(17, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(18, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(18, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(18, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(18, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(19, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(19, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(19, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(19, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(20, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(20, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(20, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(20, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(21, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(21, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(21, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(21, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(22, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(22, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(22, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(22, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(23, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(23, 49) + threadIdx_x_2] = 
kernel[blockIdx_x * 36864 + (T.Mul(23, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(23, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(24, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(24, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(24, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(24, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(25, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(25, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(25, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(25, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(26, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(26, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(26, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(26, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(27, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(27, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(27, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(27, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(28, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(28, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(28, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(28, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(29, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(29, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(29, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(29, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(30, 49) + 
threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(30, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(30, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(30, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(31, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(31, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(31, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(31, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + for rc_outer_inner in range(8): + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + 
rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] 
+ conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = 
conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + 
pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 
+ T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + 
threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * 
kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + 
T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(0, 4) 
+ 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 
63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + 
rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = 
conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 
63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + 
rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = 
conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + 
T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + 
rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = 
conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + 
T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + 
rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = 
conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + 
T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + 
rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + for i1_inner in range(8): + compute = T.Buffer((25088,)) + bias = T.Buffer((512,)) + compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) + +T.bool(True) + +with T.allocate([8], "float32", "local") as conv2d_nchw: + pad_temp_shared = T.allocate([4032], "float32", "shared") + kernel_shared = T.allocate([1536], "float32", "shared") + threadIdx_x = T.launch_thread("threadIdx.x", 49) + conv2d_nchw_1 = T.Buffer((8,), data=conv2d_nchw, scope="local", align=32) + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = T.float32(0.0) + conv2d_nchw_1[T.Mul(0, 4) + 1] = T.float32(0.0) + conv2d_nchw_1[T.Mul(0, 4) + 2] = T.float32(0.0) + conv2d_nchw_1[T.Mul(0, 4) + 3] = T.float32(0.0) + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = T.float32(0.0) + conv2d_nchw_1[T.Mul(1, 4) + 1] = T.float32(0.0) + conv2d_nchw_1[T.Mul(1, 4) + 2] = T.float32(0.0) + conv2d_nchw_1[T.Mul(1, 4) + 3] = T.float32(0.0) + blockIdx_x = T.int32() + for rc_outer_outer, 
rx_outer_outer in T.grid(8, 3): + threadIdx_x_1 = T.env_thread("threadIdx.x") + pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") + data = T.Buffer((25088,)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(0, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(0, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(0, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(0, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(0, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(1, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(1, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(1, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(1, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(1, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(2, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(2, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(2, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(2, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(2, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(3, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(3, 49) + threadIdx_x_1 < 4032: + 
pad_temp_shared_1[T.Mul(3, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(3, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(3, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(4, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(4, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(4, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(4, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(4, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(5, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(5, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(5, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(5, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(5, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(6, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(6, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(6, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(6, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(6, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(6, 7) + 
threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(7, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(7, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(7, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(7, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(7, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(8, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(8, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(8, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(8, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(8, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(9, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(9, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(9, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(9, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(9, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(10, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(10, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(10, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(10, 7) + threadIdx_x_1 // 7) % 9 and 
(T.Mul(10, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(11, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(11, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(11, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(11, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(11, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(12, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(12, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(12, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(12, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(12, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(13, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(13, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(13, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(13, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(13, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(13, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + 
with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(14, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(14, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(14, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(14, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(14, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(15, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(15, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(15, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(15, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(15, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(16, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(16, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(16, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(16, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(16, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(16, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(17, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(17, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(17, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(17, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(17, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + 
threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(18, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(18, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(18, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(18, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(18, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(19, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(19, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(19, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(19, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(19, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(20, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(20, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(20, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(20, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(20, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(21, 7) + 
threadIdx_x_1 // 7 < 576: + if T.Mul(21, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(21, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(21, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(21, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(22, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(22, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(22, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(22, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(22, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(23, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(23, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(23, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(23, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(23, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(24, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(24, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(24, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(24, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(24, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 
8, data[rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(25, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(25, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(25, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(25, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(25, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(25, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(26, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(26, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(26, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(26, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(26, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(27, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(27, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(27, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(27, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(27, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(28, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(28, 49) + threadIdx_x_1 < 4032: + 
pad_temp_shared_1[T.Mul(28, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(28, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(28, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(29, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(29, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(29, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(29, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(29, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(30, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(30, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(30, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(30, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(30, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(31, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(31, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(31, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(31, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(31, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x_1 // 7) // 9 
* 49 + (T.Mul(31, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(32, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(32, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(32, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(32, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(32, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(33, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(33, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(33, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(33, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(33, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(34, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(34, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(34, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(34, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(34, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(35, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(35, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(35, 49) + threadIdx_x_1] = T.if_then_else(1 <= 
(T.Mul(35, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(35, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(36, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(36, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(36, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(36, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(36, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(37, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(37, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(37, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(37, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(37, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(37, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(38, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(38, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(38, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(38, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(38, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + 
threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(39, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(39, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(39, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(39, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(39, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(40, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(40, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(40, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(40, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(40, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(41, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(41, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(41, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(41, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(41, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(42, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(42, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(42, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(42, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(42, 7) + 
threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(43, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(43, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(43, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(43, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(43, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(44, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(44, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(44, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(44, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(44, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(45, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(45, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(45, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(45, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(45, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with 
T.launch_thread(threadIdx_x_1, 49): + if T.Mul(46, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(46, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(46, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(46, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(46, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(47, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(47, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(47, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(47, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(47, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(48, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(48, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(48, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(48, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(48, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(49, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(49, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(49, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(49, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(49, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + 
threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(50, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(50, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(50, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(50, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(50, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(51, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(51, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(51, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(51, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(51, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(51, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(52, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(52, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(52, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(52, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(52, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(53, 7) + 
threadIdx_x_1 // 7 < 576: + if T.Mul(53, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(53, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(53, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(53, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(54, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(54, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(54, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(54, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(54, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(55, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(55, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(55, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(55, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(55, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(56, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(56, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(56, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(56, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(56, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 
8, data[rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(57, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(57, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(57, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(57, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(57, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(58, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(58, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(58, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(58, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(58, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(59, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(59, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(59, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(59, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(59, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(60, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(60, 49) + threadIdx_x_1 < 4032: + 
pad_temp_shared_1[T.Mul(60, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(60, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(60, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(61, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(61, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(61, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(61, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(61, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(62, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(62, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(62, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(62, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(62, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(63, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(63, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(63, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(63, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(63, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x_1 // 7) // 9 
* 49 + (T.Mul(63, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(64, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(64, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(64, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(64, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(64, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(65, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(65, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(65, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(65, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(65, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(66, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(66, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(66, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(66, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(66, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(67, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(67, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(67, 49) + threadIdx_x_1] = T.if_then_else(1 <= 
(T.Mul(67, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(67, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(68, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(68, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(68, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(68, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(68, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(69, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(69, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(69, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(69, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(69, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(70, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(70, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(70, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(70, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(70, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + 
threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(71, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(71, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(71, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(71, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(71, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(72, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(72, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(72, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(72, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(72, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(73, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(73, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(73, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(73, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(73, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(74, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(74, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(74, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(74, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(74, 7) + 
threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(75, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(75, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(75, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(75, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(75, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(76, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(76, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(76, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(76, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(76, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(77, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(77, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(77, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(77, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(77, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(77, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with 
T.launch_thread(threadIdx_x_1, 49): + if T.Mul(78, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(78, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(78, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(78, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(78, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(79, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(79, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(79, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(79, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(79, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(80, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(80, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(80, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(80, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(80, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(81, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(81, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(81, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(81, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(81, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + 
threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(82, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(82, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(82, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(82, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(82, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(82, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + threadIdx_x_2 = T.env_thread("threadIdx.x") + kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") + kernel = T.Buffer((2359296,)) + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(0, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(0, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(0, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(0, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(1, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(1, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(1, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(1, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(2, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(2, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(2, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(2, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(3, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(3, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + 
(T.Mul(3, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(3, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(4, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(4, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(4, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(4, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(5, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(5, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(5, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(5, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(6, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(6, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(6, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(6, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(7, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(7, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(7, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(7, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(8, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(8, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(8, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(8, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(9, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(9, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(9, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(9, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(10, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(10, 49) 
+ threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(10, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(10, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(11, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(11, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(11, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(11, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(12, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(12, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(12, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(12, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(13, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(13, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(13, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(13, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(14, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(14, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(14, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(14, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(15, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(15, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(15, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(15, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(16, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(16, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(16, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(16, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + 
if T.Mul(17, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(17, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(17, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(17, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(18, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(18, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(18, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(18, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(19, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(19, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(19, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(19, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(20, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(20, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(20, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(20, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(21, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(21, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(21, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(21, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(22, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(22, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(22, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(22, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(23, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(23, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(23, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(23, 49) + threadIdx_x_2) 
% 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(24, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(24, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(24, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(24, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(25, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(25, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(25, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(25, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(26, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(26, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(26, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(26, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(27, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(27, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(27, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(27, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(28, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(28, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(28, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(28, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(29, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(29, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(29, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(29, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(30, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(30, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(30, 49) + 
threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(30, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(31, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(31, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(31, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(31, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + for rc_outer_inner in range(8): + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 2] = 
conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 2] = 
conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 2] = 
conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 2] = 
conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 2] = 
conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 2] = 
conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 2] = 
conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 2] = 
conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + 
pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + 
T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 
768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 
3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 3] = 
conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + 
pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + 
T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 
768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 
3) + 2] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = 
conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + 
pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + 
T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 
768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 
3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 1] = 
conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + for i1_inner in range(8): + compute = T.Buffer((25088,)) + bias = T.Buffer((512,)) + compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw_1[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) + +with T.launch_thread("blockIdx.x", 64) as blockIdx_x: + conv2d_nchw = T.allocate([8], "float32", "local") + 
pad_temp_shared = T.allocate([4032], "float32", "shared") + kernel_shared = T.allocate([1536], "float32", "shared") + threadIdx_x = T.launch_thread("threadIdx.x", 49) + conv2d_nchw_1 = T.Buffer((8,), data=conv2d_nchw, scope="local", align=32) + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = T.float32(0.0) + conv2d_nchw_1[T.Mul(0, 4) + 1] = T.float32(0.0) + conv2d_nchw_1[T.Mul(0, 4) + 2] = T.float32(0.0) + conv2d_nchw_1[T.Mul(0, 4) + 3] = T.float32(0.0) + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = T.float32(0.0) + conv2d_nchw_1[T.Mul(1, 4) + 1] = T.float32(0.0) + conv2d_nchw_1[T.Mul(1, 4) + 2] = T.float32(0.0) + conv2d_nchw_1[T.Mul(1, 4) + 3] = T.float32(0.0) + for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + threadIdx_x_1 = T.env_thread("threadIdx.x") + pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") + data = T.Buffer((25088,)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(0, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(0, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(0, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(0, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(0, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(1, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(1, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(1, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(1, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(1, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with 
T.launch_thread(threadIdx_x_1, 49): + if T.Mul(2, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(2, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(2, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(2, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(2, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(3, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(3, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(3, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(3, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(3, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(4, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(4, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(4, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(4, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(4, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(5, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(5, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(5, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(5, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(5, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and 
rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(6, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(6, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(6, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(6, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(6, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(6, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(7, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(7, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(7, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(7, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(7, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(8, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(8, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(8, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(8, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(8, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(9, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(9, 49) + 
threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(9, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(9, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(9, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(10, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(10, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(10, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(10, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(10, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(11, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(11, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(11, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(11, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(11, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(12, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(12, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(12, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(12, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(12, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(12, 7) + 
threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(13, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(13, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(13, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(13, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(13, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(13, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(14, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(14, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(14, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(14, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(14, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(15, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(15, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(15, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(15, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(15, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(16, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(16, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(16, 49) + 
threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(16, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(16, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(16, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(17, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(17, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(17, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(17, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(17, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(18, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(18, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(18, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(18, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(18, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(19, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(19, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(19, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(19, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(19, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(19, 7) + 
threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(20, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(20, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(20, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(20, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(20, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(21, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(21, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(21, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(21, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(21, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(22, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(22, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(22, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(22, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(22, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(23, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(23, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(23, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(23, 7) + 
threadIdx_x_1 // 7) % 9 and (T.Mul(23, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(24, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(24, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(24, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(24, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(24, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(25, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(25, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(25, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(25, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(25, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(25, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(26, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(26, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(26, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(26, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(26, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 
7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(27, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(27, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(27, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(27, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(27, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(28, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(28, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(28, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(28, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(28, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(29, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(29, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(29, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(29, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(29, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(30, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(30, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(30, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(30, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(30, 7) + threadIdx_x_1 // 7) % 9 < 8 
and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(31, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(31, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(31, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(31, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(31, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(31, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(32, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(32, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(32, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(32, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(32, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(33, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(33, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(33, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(33, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(33, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if 
T.Mul(34, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(34, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(34, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(34, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(34, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(35, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(35, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(35, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(35, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(35, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(36, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(36, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(36, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(36, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(36, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(37, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(37, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(37, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(37, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(37, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + 
threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(37, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(38, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(38, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(38, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(38, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(38, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(39, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(39, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(39, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(39, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(39, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(40, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(40, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(40, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(40, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(40, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(41, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(41, 49) + 
threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(41, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(41, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(41, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(42, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(42, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(42, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(42, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(42, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(43, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(43, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(43, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(43, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(43, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(44, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(44, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(44, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(44, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(44, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(44, 7) + 
threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(45, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(45, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(45, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(45, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(45, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(46, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(46, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(46, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(46, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(46, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(47, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(47, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(47, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(47, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(47, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(48, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(48, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(48, 49) + 
threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(48, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(48, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(49, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(49, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(49, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(49, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(49, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(50, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(50, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(50, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(50, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(50, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(51, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(51, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(51, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(51, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(51, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(51, 7) + 
threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(52, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(52, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(52, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(52, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(52, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(53, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(53, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(53, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(53, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(53, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(54, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(54, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(54, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(54, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(54, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(55, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(55, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(55, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(55, 7) + 
threadIdx_x_1 // 7) % 9 and (T.Mul(55, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(56, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(56, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(56, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(56, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(56, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(57, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(57, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(57, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(57, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(57, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(58, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(58, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(58, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(58, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(58, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 
7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(59, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(59, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(59, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(59, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(59, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(60, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(60, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(60, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(60, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(60, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(61, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(61, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(61, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(61, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(61, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(62, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(62, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(62, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(62, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(62, 7) + threadIdx_x_1 // 7) % 9 < 8 
and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(63, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(63, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(63, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(63, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(63, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(64, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(64, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(64, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(64, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(64, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(65, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(65, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(65, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(65, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(65, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if 
T.Mul(66, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(66, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(66, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(66, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(66, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(67, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(67, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(67, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(67, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(67, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(68, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(68, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(68, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(68, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(68, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(69, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(69, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(69, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(69, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(69, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + 
threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(70, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(70, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(70, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(70, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(70, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(71, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(71, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(71, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(71, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(71, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(72, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(72, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(72, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(72, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(72, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(73, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(73, 49) + 
threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(73, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(73, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(73, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(74, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(74, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(74, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(74, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(74, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(75, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(75, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(75, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(75, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(75, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(76, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(76, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(76, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(76, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(76, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(76, 7) + 
threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(77, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(77, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(77, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(77, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(77, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(77, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(78, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(78, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(78, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(78, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(78, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(79, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(79, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(79, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(79, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(79, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(80, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(80, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(80, 49) + 
threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(80, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(80, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(81, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(81, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(81, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(81, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(81, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.Mul(82, 7) + threadIdx_x_1 // 7 < 576: + if T.Mul(82, 49) + threadIdx_x_1 < 4032: + pad_temp_shared_1[T.Mul(82, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(82, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(82, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(82, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + threadIdx_x_2 = T.env_thread("threadIdx.x") + kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") + kernel = T.Buffer((2359296,)) + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(0, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(0, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(0, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(0, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 
49): + if T.Mul(1, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(1, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(1, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(1, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(2, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(2, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(2, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(2, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(3, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(3, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(3, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(3, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(4, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(4, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(4, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(4, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(5, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(5, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(5, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(5, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(6, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(6, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(6, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(6, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(7, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(7, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(7, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(7, 49) + threadIdx_x_2) % 192 * 3 + 
rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(8, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(8, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(8, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(8, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(9, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(9, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(9, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(9, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(10, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(10, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(10, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(10, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(11, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(11, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(11, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(11, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(12, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(12, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(12, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(12, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(13, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(13, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(13, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(13, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(14, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(14, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(14, 49) + threadIdx_x_2) // 192 * 4608 + 
rc_outer_outer * 576 + (T.Mul(14, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(15, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(15, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(15, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(15, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(16, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(16, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(16, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(16, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(17, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(17, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(17, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(17, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(18, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(18, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(18, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(18, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(19, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(19, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(19, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(19, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(20, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(20, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(20, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(20, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(21, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(21, 49) + threadIdx_x_2] = 
kernel[blockIdx_x * 36864 + (T.Mul(21, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(21, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(22, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(22, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(22, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(22, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(23, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(23, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(23, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(23, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(24, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(24, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(24, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(24, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(25, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(25, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(25, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(25, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(26, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(26, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(26, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(26, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(27, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(27, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(27, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(27, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(28, 49) + 
threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(28, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(28, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(28, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(29, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(29, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(29, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(29, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(30, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(30, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(30, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(30, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + if T.Mul(31, 49) + threadIdx_x_2 < 1536: + kernel_shared_1[T.Mul(31, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(31, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(31, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] + for rc_outer_inner in range(8): + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + 
pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + 
pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + 
pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + 
pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + 
pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + 
pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + 
pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + 
pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + 
pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + 
T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 
768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 
3) + 1] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = 
conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + 
pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + 
T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 
768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 
3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 1] = 
conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + 
pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + 
T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 
768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 
3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 2] = 
conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] + conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + 
pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] + for i1_inner in range(8): + compute = T.Buffer((25088,)) + bias = T.Buffer((512,)) + compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw_1[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) + +# from tvm.script import ir as I +# from tvm.script import tir as T + +@I.ir_module +class Module: + @T.prim_func + def main(data: T.Buffer((1, 512, 7, 7), "float32"), kernel: T.Buffer((512, 512, 3, 3), "float32"), bias: T.Buffer((1, 512, 1, 1), "float32"), compute: T.Buffer((1, 512, 7, 7), "float32")): + T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)}) + blockIdx_x = T.launch_thread("blockIdx.x", 64) + conv2d_nchw = T.allocate([8], "float32", "local") + pad_temp_shared = T.allocate([4032], "float32", "shared") + kernel_shared = T.allocate([1536], "float32", "shared") + threadIdx_x = T.launch_thread("threadIdx.x", 49) + conv2d_nchw_1 = T.Buffer((8,), data=conv2d_nchw, scope="local", align=32) + conv2d_nchw_1[0] = T.float32(0.0) + conv2d_nchw_1[1] = T.float32(0.0) + conv2d_nchw_1[2] = T.float32(0.0) + conv2d_nchw_1[3] = T.float32(0.0) + conv2d_nchw_1[4] = T.float32(0.0) + conv2d_nchw_1[5] = T.float32(0.0) + conv2d_nchw_1[6] = T.float32(0.0) + conv2d_nchw_1[7] = T.float32(0.0) + for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + cse_var_2: T.int32 = rc_outer_outer * 3136 + cse_var_1: T.int32 = rc_outer_outer * 576 + threadIdx_x_1 = T.env_thread("threadIdx.x") + pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") + data_1 = T.Buffer((25088,), data=data.data) + with 
T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 49] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 49) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 98] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 98) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 147] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 147) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 196] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 196) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 245] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and 
rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 245) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 294] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 294) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 343] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 343) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 392] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 392) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 441] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 335], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 490] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 490) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], 
T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 539] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 539) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 588] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 588) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 637] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 637) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 686] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 686) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 735] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 735) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + 
pad_temp_shared_1[threadIdx_x_1 + 784] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 784) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 833] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 833) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 882] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 678], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 931] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 931) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 980] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 980) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1029] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and 
rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1029) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1078] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1078) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1127] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1127) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1176] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1176) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1225] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1225) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1274] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1274) // 63 * 49 + threadIdx_x_1 + rx_outer_outer 
+ 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1323] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 1021], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1372] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1372) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1421] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1421) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1470] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1470) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1519] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1519) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1568] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 
and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1568) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1617] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1617) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1666] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1666) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1715] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1715) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1764] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 1364], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1813] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1813) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 
7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1862] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1862) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1911] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1911) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1960] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1960) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2009] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2009) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2058] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2058) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with 
T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2107] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2107) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2156] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2156) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2205] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 1707], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2254] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2254) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2303] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2303) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2352] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= 
rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2352) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2401] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2401) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2450] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2450) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2499] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2499) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2548] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2548) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2597] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2597) // 
63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2646] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 2050], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2695] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2695) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2744] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2744) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2793] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2793) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2842] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2842) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2891] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 
9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2891) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2940] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2940) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2989] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2989) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3038] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3038) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3087] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 2393], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3136] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3136) // 63 
* 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3185] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3185) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3234] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3234) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3283] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3283) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3332] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3332) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3381] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3381) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 
% 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3430] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3430) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3479] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3479) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3528] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 2736], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3577] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3577) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3626] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3626) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3675] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and 
(threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3675) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3724] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3724) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3773] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3773) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3822] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3822) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3871] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3871) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3920] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, 
data_1[cse_var_2 + (threadIdx_x_1 + 3920) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3969] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 3079], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if threadIdx_x_1 < 14: + pad_temp_shared_1[threadIdx_x_1 + 4018] = T.if_then_else(threadIdx_x_1 < 7 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x_1 + 41], T.float32(0.0)) + threadIdx_x_2 = T.env_thread("threadIdx.x") + kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") + kernel_1 = T.Buffer((2359296,), data=kernel.data) + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2] = kernel_1[blockIdx_x * 36864 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 49] = kernel_1[blockIdx_x * 36864 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 147] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 98] = kernel_1[blockIdx_x * 36864 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 294] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 147] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 147) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 147) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 196] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 196) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 12] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 245] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 245) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + 
rx_outer_outer + 159] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 294] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 294) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 306] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 343] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 343) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 151) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 392] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 392) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 24] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 441] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 441) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 171] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 490] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 490) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 318] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 539] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 539) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 155) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 588] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 588) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 36] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 637] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 637) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 183] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 686] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 686) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 330] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 735] = kernel_1[blockIdx_x * 
36864 + (threadIdx_x_2 + 735) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 159) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 784] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 784) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 48] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 833] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 833) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 195] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 882] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 882) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 342] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 931] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 931) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 163) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 980] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 980) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 60] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1029] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1029) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 207] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1078] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1078) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 354] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1127] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1127) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 167) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1176] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1176) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 72] + with 
T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1225] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1225) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 219] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1274] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1274) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 366] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1323] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1323) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 171) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1372] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1372) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 84] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1421] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1421) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 231] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1470] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1470) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 378] + with T.launch_thread(threadIdx_x_2, 49): + if threadIdx_x_2 < 17: + kernel_shared_1[threadIdx_x_2 + 1519] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1519) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 525] + for rc_outer_inner in range(8): + cse_var_3: T.int32 = rc_outer_inner * 24 + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 192] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 384] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 576] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 3] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 195] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 387] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 579] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 6] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 198] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 390] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 582] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 9] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 201] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 393] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 585] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 12] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 204] + conv2d_nchw_1[2] = 
conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 396] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 588] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 15] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 207] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 399] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 591] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 18] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 210] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 402] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 594] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 21] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 213] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 405] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 597] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 768] + 
conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 960] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 1152] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 1344] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 771] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 963] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 1155] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 1347] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 774] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 966] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 1158] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 1350] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 777] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 969] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 1161] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * 
kernel_shared_1[cse_var_3 + 1353] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 780] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 972] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 1164] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 1356] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 783] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 975] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 1167] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 1359] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 786] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 978] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 1170] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 1362] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 789] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 981] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 1173] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 1365] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 1] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 193] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 385] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 577] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 4] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 196] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 388] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 580] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 7] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 199] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 391] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 583] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 10] + conv2d_nchw_1[1] = 
conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 202] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 394] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 586] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 13] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 205] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 397] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 589] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 16] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 208] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 400] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 592] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 19] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 211] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 403] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 
595] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 22] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 214] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 406] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 598] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 769] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 961] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 1153] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 1345] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 772] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 964] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 1156] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 1348] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 775] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 967] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * 
kernel_shared_1[cse_var_3 + 1159] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 1351] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 778] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 970] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 1162] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 1354] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 781] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 973] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 1165] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 1357] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 784] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 976] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 1168] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 1360] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 787] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 979] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 1171] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 1363] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 790] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 982] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 1174] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 1366] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 2] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 194] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 386] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 578] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 5] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 197] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 389] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 581] + conv2d_nchw_1[0] = 
conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 8] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 200] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 392] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 584] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 11] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 203] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 395] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 587] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 14] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 206] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 398] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 590] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 17] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 209] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 401] 
+ conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 593] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 20] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 212] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 404] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 596] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 23] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 215] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 407] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 599] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 770] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 962] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 1154] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 1346] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 773] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * 
kernel_shared_1[cse_var_3 + 965] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 1157] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 1349] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 776] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 968] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 1160] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 1352] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 779] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 971] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 1163] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 1355] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 782] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 974] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 1166] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 1358] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 785] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 977] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 1169] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 1361] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 788] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 980] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 1172] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 1364] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 791] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 983] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 1175] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 1367] + for i1_inner in range(8): + compute_1 = T.Buffer((25088,), data=compute.data) + bias_1 = T.Buffer((512,), data=bias.data) + compute_1[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw_1[i1_inner] + bias_1[blockIdx_x * 8 + i1_inner], T.float32(0.0)) +Phase 3 +-------------------- +64 + +8 + +4032 + +1536 + +49 + +T.float32(0.0) + +0 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) 
+conv2d_nchw[0] = T.float32(0.0) + +1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +conv2d_nchw[1] = T.float32(0.0) + +2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +conv2d_nchw[2] = T.float32(0.0) + +3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +conv2d_nchw[3] = T.float32(0.0) + +4 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +conv2d_nchw[4] = T.float32(0.0) + +5 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +conv2d_nchw[5] = T.float32(0.0) + +6 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +conv2d_nchw[6] = T.float32(0.0) + +7 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +conv2d_nchw[7] = T.float32(0.0) + +0 + +8 + +0 + +3 + +49 + +7 + +threadIdx_x + +7 <= threadIdx_x + +1 + +rx_outer_outer + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +rc_outer_outer + +3136 + +rc_outer_outer * 3136 + +rc_outer_outer * 3136 + threadIdx_x + +rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + +8 + +rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer - 8 + +data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer - 8] + +T.float32(0.0) + +T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer - 8], T.float32(0.0)) + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + 
threadIdx_x + rx_outer_outer - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +7 + +threadIdx_x // 7 + 7 + +9 + +(threadIdx_x // 7 + 7) % 9 + +1 <= (threadIdx_x // 7 + 7) % 9 + +7 + +threadIdx_x // 7 + +7 + +threadIdx_x // 7 + 7 + +9 + +(threadIdx_x // 7 + 7) % 9 + +8 + +(threadIdx_x // 7 + 7) % 9 < 8 + +1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +49 + +threadIdx_x + 49 + +63 + +(threadIdx_x + 49) // 63 + +49 + +(threadIdx_x + 49) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 49) // 63 * 49 + +7 + +threadIdx_x // 7 + +7 + +threadIdx_x // 7 + 7 + +9 + +(threadIdx_x // 7 + 7) % 9 + +7 + +(threadIdx_x // 7 + 7) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 49) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 49) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 49) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 49) // 
63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 49) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 49) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +49 + +threadIdx_x + 49 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 49] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 49) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 49] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 49) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +5 + +threadIdx_x // 7 + 5 + +9 + +(threadIdx_x // 7 + 5) % 9 + +1 <= (threadIdx_x // 7 + 5) % 9 + +7 + +threadIdx_x // 7 + +5 + +threadIdx_x // 7 + 5 + +9 + +(threadIdx_x // 7 + 5) % 9 + +8 + +(threadIdx_x // 7 + 5) % 9 < 8 + +1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer 
+ threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +98 + +threadIdx_x + 98 + +63 + +(threadIdx_x + 98) // 63 + +49 + +(threadIdx_x + 98) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 98) // 63 * 49 + +7 + +threadIdx_x // 7 + +5 + +threadIdx_x // 7 + 5 + +9 + +(threadIdx_x // 7 + 5) % 9 + +7 + +(threadIdx_x // 7 + 5) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 98) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 98) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 98) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 98) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 98) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 98) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +98 + +threadIdx_x + 98 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 98] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= 
rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 98) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 98] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 98) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +3 + +threadIdx_x // 7 + 3 + +9 + +(threadIdx_x // 7 + 3) % 9 + +1 <= (threadIdx_x // 7 + 3) % 9 + +7 + +threadIdx_x // 7 + +3 + +threadIdx_x // 7 + 3 + +9 + +(threadIdx_x // 7 + 3) % 9 + +8 + +(threadIdx_x // 7 + 3) % 9 < 8 + +1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +147 + +threadIdx_x + 147 + +63 + +(threadIdx_x + 147) // 63 + +49 + +(threadIdx_x + 147) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 147) // 63 * 49 + +7 + +threadIdx_x // 7 + +3 + +threadIdx_x // 7 + 3 + +9 + +(threadIdx_x // 7 + 3) % 9 + +7 + +(threadIdx_x // 7 + 3) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 147) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + 
+rc_outer_outer * 3136 + (threadIdx_x + 147) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 147) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 147) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 147) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 147) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +147 + +threadIdx_x + 147 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 147] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 147) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 147] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 147) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + 
threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +196 + +threadIdx_x + 196 + +63 + +(threadIdx_x + 196) // 63 + +49 + +(threadIdx_x + 196) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 196) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 196) // 63 * 49 + threadIdx_x + +rc_outer_outer * 3136 + (threadIdx_x + 196) // 63 * 49 + threadIdx_x + rx_outer_outer + +1 + +rc_outer_outer * 3136 + (threadIdx_x + 196) // 63 * 49 + threadIdx_x + rx_outer_outer - 1 + +data[rc_outer_outer * 3136 + (threadIdx_x + 196) // 63 * 49 + threadIdx_x + rx_outer_outer - 1] + +T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 196) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + +196 + +threadIdx_x + 196 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +rx_outer_outer = T.int32() +threadIdx_x = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 196] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 196) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 196] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 196) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +8 + +threadIdx_x // 7 + 8 + +9 + +(threadIdx_x // 7 + 8) % 9 + +1 <= 
(threadIdx_x // 7 + 8) % 9 + +7 + +threadIdx_x // 7 + +8 + +threadIdx_x // 7 + 8 + +9 + +(threadIdx_x // 7 + 8) % 9 + +8 + +(threadIdx_x // 7 + 8) % 9 < 8 + +1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +245 + +threadIdx_x + 245 + +63 + +(threadIdx_x + 245) // 63 + +49 + +(threadIdx_x + 245) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 245) // 63 * 49 + +7 + +threadIdx_x // 7 + +8 + +threadIdx_x // 7 + 8 + +9 + +(threadIdx_x // 7 + 8) % 9 + +7 + +(threadIdx_x // 7 + 8) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 245) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 245) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 245) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 245) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 245) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 245) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +245 + +threadIdx_x + 245 + 
+pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 245] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 245) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 245] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 245) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +6 + +threadIdx_x // 7 + 6 + +9 + +(threadIdx_x // 7 + 6) % 9 + +1 <= (threadIdx_x // 7 + 6) % 9 + +7 + +threadIdx_x // 7 + +6 + +threadIdx_x // 7 + 6 + +9 + +(threadIdx_x // 7 + 6) % 9 + +8 + +(threadIdx_x // 7 + 6) % 9 < 8 + +1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +294 + +threadIdx_x + 294 + +63 + +(threadIdx_x + 294) // 63 + +49 + +(threadIdx_x + 294) // 
63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 294) // 63 * 49 + +7 + +threadIdx_x // 7 + +6 + +threadIdx_x // 7 + 6 + +9 + +(threadIdx_x // 7 + 6) % 9 + +7 + +(threadIdx_x // 7 + 6) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 294) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 294) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 294) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 294) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 294) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 294) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +294 + +threadIdx_x + 294 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 294] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 294) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 294] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 
and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 294) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +4 + +threadIdx_x // 7 + 4 + +9 + +(threadIdx_x // 7 + 4) % 9 + +1 <= (threadIdx_x // 7 + 4) % 9 + +7 + +threadIdx_x // 7 + +4 + +threadIdx_x // 7 + 4 + +9 + +(threadIdx_x // 7 + 4) % 9 + +8 + +(threadIdx_x // 7 + 4) % 9 < 8 + +1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +343 + +threadIdx_x + 343 + +63 + +(threadIdx_x + 343) // 63 + +49 + +(threadIdx_x + 343) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 343) // 63 * 49 + +7 + +threadIdx_x // 7 + +4 + +threadIdx_x // 7 + 4 + +9 + +(threadIdx_x // 7 + 4) % 9 + +7 + +(threadIdx_x // 7 + 4) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 343) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 343) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 343) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 343) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 343) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 
<= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 343) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +343 + +threadIdx_x + 343 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 343] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 343) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 343] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 343) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +42 + +threadIdx_x < 42 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +392 + +threadIdx_x + 392 + +63 + +(threadIdx_x + 392) // 63 + +49 + +(threadIdx_x + 392) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 392) // 63 * 49 + 
+rc_outer_outer * 3136 + (threadIdx_x + 392) // 63 * 49 + threadIdx_x + +rc_outer_outer * 3136 + (threadIdx_x + 392) // 63 * 49 + threadIdx_x + rx_outer_outer + +6 + +rc_outer_outer * 3136 + (threadIdx_x + 392) // 63 * 49 + threadIdx_x + rx_outer_outer + 6 + +data[rc_outer_outer * 3136 + (threadIdx_x + 392) // 63 * 49 + threadIdx_x + rx_outer_outer + 6] + +T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 392) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + +392 + +threadIdx_x + 392 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 392] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 392) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 392] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 392) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + +7 + +7 <= threadIdx_x + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +rc_outer_outer * 3136 + threadIdx_x + 
+rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + +335 + +rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 335 + +data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 335] + +T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 335], T.float32(0.0)) + +441 + +threadIdx_x + 441 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 441] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 335], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 441] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 335], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +7 + +threadIdx_x // 7 + 7 + +9 + +(threadIdx_x // 7 + 7) % 9 + +1 <= (threadIdx_x // 7 + 7) % 9 + +7 + +threadIdx_x // 7 + +7 + +threadIdx_x // 7 + 7 + +9 + +(threadIdx_x // 7 + 7) % 9 + +8 + +(threadIdx_x // 7 + 7) % 9 < 8 + +1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer 
+ threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +490 + +threadIdx_x + 490 + +63 + +(threadIdx_x + 490) // 63 + +49 + +(threadIdx_x + 490) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 490) // 63 * 49 + +7 + +threadIdx_x // 7 + +7 + +threadIdx_x // 7 + 7 + +9 + +(threadIdx_x // 7 + 7) % 9 + +7 + +(threadIdx_x // 7 + 7) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 490) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 490) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 490) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 490) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 490) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 490) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +490 + +threadIdx_x + 490 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 490] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 490) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = 
T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 490] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 490) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +5 + +threadIdx_x // 7 + 5 + +9 + +(threadIdx_x // 7 + 5) % 9 + +1 <= (threadIdx_x // 7 + 5) % 9 + +7 + +threadIdx_x // 7 + +5 + +threadIdx_x // 7 + 5 + +9 + +(threadIdx_x // 7 + 5) % 9 + +8 + +(threadIdx_x // 7 + 5) % 9 < 8 + +1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +539 + +threadIdx_x + 539 + +63 + +(threadIdx_x + 539) // 63 + +49 + +(threadIdx_x + 539) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 539) // 63 * 49 + +7 + +threadIdx_x // 7 + +5 + +threadIdx_x // 7 + 5 + +9 + +(threadIdx_x // 7 + 5) % 9 + +7 + +(threadIdx_x // 7 + 5) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 539) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 539) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 539) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 539) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + 
rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 539) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 539) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +539 + +threadIdx_x + 539 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 539] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 539) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 539] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 539) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +3 + +threadIdx_x // 7 + 3 + +9 + +(threadIdx_x // 7 + 3) % 9 + +1 <= (threadIdx_x // 7 + 3) % 9 + +7 + +threadIdx_x // 7 + +3 + +threadIdx_x // 7 + 3 + +9 + +(threadIdx_x // 7 + 3) % 9 + +8 + +(threadIdx_x // 7 + 3) % 9 < 8 + +1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= 
rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +588 + +threadIdx_x + 588 + +63 + +(threadIdx_x + 588) // 63 + +49 + +(threadIdx_x + 588) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 588) // 63 * 49 + +7 + +threadIdx_x // 7 + +3 + +threadIdx_x // 7 + 3 + +9 + +(threadIdx_x // 7 + 3) % 9 + +7 + +(threadIdx_x // 7 + 3) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 588) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 588) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 588) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 588) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 588) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 588) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +588 + +threadIdx_x + 588 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 588] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer 
+ threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 588) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 588] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 588) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +637 + +threadIdx_x + 637 + +63 + +(threadIdx_x + 637) // 63 + +49 + +(threadIdx_x + 637) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 637) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 637) // 63 * 49 + threadIdx_x + +rc_outer_outer * 3136 + (threadIdx_x + 637) // 63 * 49 + threadIdx_x + rx_outer_outer + +1 + +rc_outer_outer * 3136 + (threadIdx_x + 637) // 63 * 49 + threadIdx_x + rx_outer_outer - 1 + +data[rc_outer_outer * 3136 + (threadIdx_x + 637) // 63 * 49 + threadIdx_x + rx_outer_outer - 1] + +T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 637) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + +637 + +threadIdx_x + 637 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +rx_outer_outer = T.int32() +threadIdx_x = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer 
= T.int32() +pad_temp_shared[threadIdx_x + 637] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 637) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 637] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 637) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +8 + +threadIdx_x // 7 + 8 + +9 + +(threadIdx_x // 7 + 8) % 9 + +1 <= (threadIdx_x // 7 + 8) % 9 + +7 + +threadIdx_x // 7 + +8 + +threadIdx_x // 7 + 8 + +9 + +(threadIdx_x // 7 + 8) % 9 + +8 + +(threadIdx_x // 7 + 8) % 9 < 8 + +1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +686 + +threadIdx_x + 686 + +63 + +(threadIdx_x + 686) // 63 + +49 + +(threadIdx_x + 686) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 686) // 63 * 49 + +7 + +threadIdx_x // 7 + +8 + +threadIdx_x // 7 + 8 + +9 + +(threadIdx_x // 7 + 8) % 9 + +7 + +(threadIdx_x // 7 + 8) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 686) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 686) // 63 * 49 + (threadIdx_x // 7 + 
8) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 686) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 686) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 686) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 686) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +686 + +threadIdx_x + 686 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 686] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 686) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 686] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 686) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +6 + +threadIdx_x // 7 + 6 + +9 + +(threadIdx_x // 7 + 6) % 9 + +1 <= (threadIdx_x // 7 + 6) % 9 + 
+7 + +threadIdx_x // 7 + +6 + +threadIdx_x // 7 + 6 + +9 + +(threadIdx_x // 7 + 6) % 9 + +8 + +(threadIdx_x // 7 + 6) % 9 < 8 + +1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +735 + +threadIdx_x + 735 + +63 + +(threadIdx_x + 735) // 63 + +49 + +(threadIdx_x + 735) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 735) // 63 * 49 + +7 + +threadIdx_x // 7 + +6 + +threadIdx_x // 7 + 6 + +9 + +(threadIdx_x // 7 + 6) % 9 + +7 + +(threadIdx_x // 7 + 6) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 735) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 735) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 735) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 735) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 735) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 735) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +735 + +threadIdx_x + 735 + +pad_temp_shared = T.Buffer((4032,), 
scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 735] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 735) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 735] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 735) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +4 + +threadIdx_x // 7 + 4 + +9 + +(threadIdx_x // 7 + 4) % 9 + +1 <= (threadIdx_x // 7 + 4) % 9 + +7 + +threadIdx_x // 7 + +4 + +threadIdx_x // 7 + 4 + +9 + +(threadIdx_x // 7 + 4) % 9 + +8 + +(threadIdx_x // 7 + 4) % 9 < 8 + +1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +784 + +threadIdx_x + 784 + +63 + +(threadIdx_x + 784) // 63 + +49 + +(threadIdx_x + 784) // 63 * 49 + +rc_outer_outer * 3136 + 
(threadIdx_x + 784) // 63 * 49 + +7 + +threadIdx_x // 7 + +4 + +threadIdx_x // 7 + 4 + +9 + +(threadIdx_x // 7 + 4) % 9 + +7 + +(threadIdx_x // 7 + 4) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 784) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 784) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 784) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 784) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 784) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 784) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +784 + +threadIdx_x + 784 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 784] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 784) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 784] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + 
threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 784) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +42 + +threadIdx_x < 42 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +833 + +threadIdx_x + 833 + +63 + +(threadIdx_x + 833) // 63 + +49 + +(threadIdx_x + 833) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 833) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 833) // 63 * 49 + threadIdx_x + +rc_outer_outer * 3136 + (threadIdx_x + 833) // 63 * 49 + threadIdx_x + rx_outer_outer + +6 + +rc_outer_outer * 3136 + (threadIdx_x + 833) // 63 * 49 + threadIdx_x + rx_outer_outer + 6 + +data[rc_outer_outer * 3136 + (threadIdx_x + 833) // 63 * 49 + threadIdx_x + rx_outer_outer + 6] + +T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 833) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + +833 + +threadIdx_x + 833 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 833] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 833) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + 
data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 833] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 833) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + +7 + +7 <= threadIdx_x + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +rc_outer_outer * 3136 + threadIdx_x + +rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + +678 + +rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 678 + +data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 678] + +T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 678], T.float32(0.0)) + +882 + +threadIdx_x + 882 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 882] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 678], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 882] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + 
threadIdx_x + rx_outer_outer + 678], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +7 + +threadIdx_x // 7 + 7 + +9 + +(threadIdx_x // 7 + 7) % 9 + +1 <= (threadIdx_x // 7 + 7) % 9 + +7 + +threadIdx_x // 7 + +7 + +threadIdx_x // 7 + 7 + +9 + +(threadIdx_x // 7 + 7) % 9 + +8 + +(threadIdx_x // 7 + 7) % 9 < 8 + +1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +931 + +threadIdx_x + 931 + +63 + +(threadIdx_x + 931) // 63 + +49 + +(threadIdx_x + 931) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 931) // 63 * 49 + +7 + +threadIdx_x // 7 + +7 + +threadIdx_x // 7 + 7 + +9 + +(threadIdx_x // 7 + 7) % 9 + +7 + +(threadIdx_x // 7 + 7) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 931) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 931) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 931) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 931) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 931) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + 
(threadIdx_x + 931) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +931 + +threadIdx_x + 931 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 931] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 931) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 931] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 931) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +5 + +threadIdx_x // 7 + 5 + +9 + +(threadIdx_x // 7 + 5) % 9 + +1 <= (threadIdx_x // 7 + 5) % 9 + +7 + +threadIdx_x // 7 + +5 + +threadIdx_x // 7 + 5 + +9 + +(threadIdx_x // 7 + 5) % 9 + +8 + +(threadIdx_x // 7 + 5) % 9 < 8 + +1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and 
rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +980 + +threadIdx_x + 980 + +63 + +(threadIdx_x + 980) // 63 + +49 + +(threadIdx_x + 980) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 980) // 63 * 49 + +7 + +threadIdx_x // 7 + +5 + +threadIdx_x // 7 + 5 + +9 + +(threadIdx_x // 7 + 5) % 9 + +7 + +(threadIdx_x // 7 + 5) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 980) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 980) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 980) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 980) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 980) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 980) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +980 + +threadIdx_x + 980 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 980] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 980) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = 
T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 980] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 980) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +3 + +threadIdx_x // 7 + 3 + +9 + +(threadIdx_x // 7 + 3) % 9 + +1 <= (threadIdx_x // 7 + 3) % 9 + +7 + +threadIdx_x // 7 + +3 + +threadIdx_x // 7 + 3 + +9 + +(threadIdx_x // 7 + 3) % 9 + +8 + +(threadIdx_x // 7 + 3) % 9 < 8 + +1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +1029 + +threadIdx_x + 1029 + +63 + +(threadIdx_x + 1029) // 63 + +49 + +(threadIdx_x + 1029) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 1029) // 63 * 49 + +7 + +threadIdx_x // 7 + +3 + +threadIdx_x // 7 + 3 + +9 + +(threadIdx_x // 7 + 3) % 9 + +7 + +(threadIdx_x // 7 + 3) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1029) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1029) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1029) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 1029) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer 
+ threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 1029) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1029) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1029 + +threadIdx_x + 1029 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 1029] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1029) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 1029] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1029) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +1078 + +threadIdx_x + 1078 + +63 + +(threadIdx_x + 1078) // 63 + +49 + +(threadIdx_x + 
1078) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 1078) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 1078) // 63 * 49 + threadIdx_x + +rc_outer_outer * 3136 + (threadIdx_x + 1078) // 63 * 49 + threadIdx_x + rx_outer_outer + +1 + +rc_outer_outer * 3136 + (threadIdx_x + 1078) // 63 * 49 + threadIdx_x + rx_outer_outer - 1 + +data[rc_outer_outer * 3136 + (threadIdx_x + 1078) // 63 * 49 + threadIdx_x + rx_outer_outer - 1] + +T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1078) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + +1078 + +threadIdx_x + 1078 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +rx_outer_outer = T.int32() +threadIdx_x = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 1078] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1078) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 1078] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1078) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +8 + +threadIdx_x // 7 + 8 + +9 + +(threadIdx_x // 7 + 8) % 9 + +1 <= (threadIdx_x // 7 + 8) % 9 + +7 + +threadIdx_x // 7 + +8 + +threadIdx_x // 7 + 8 + +9 + +(threadIdx_x // 7 + 8) % 9 + +8 + +(threadIdx_x // 7 + 8) % 9 < 8 + +1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 
8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +1127 + +threadIdx_x + 1127 + +63 + +(threadIdx_x + 1127) // 63 + +49 + +(threadIdx_x + 1127) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 1127) // 63 * 49 + +7 + +threadIdx_x // 7 + +8 + +threadIdx_x // 7 + 8 + +9 + +(threadIdx_x // 7 + 8) % 9 + +7 + +(threadIdx_x // 7 + 8) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1127) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1127) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1127) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 1127) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 1127) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1127) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1127 + +threadIdx_x + 1127 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 1127] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + 
threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1127) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 1127] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1127) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +6 + +threadIdx_x // 7 + 6 + +9 + +(threadIdx_x // 7 + 6) % 9 + +1 <= (threadIdx_x // 7 + 6) % 9 + +7 + +threadIdx_x // 7 + +6 + +threadIdx_x // 7 + 6 + +9 + +(threadIdx_x // 7 + 6) % 9 + +8 + +(threadIdx_x // 7 + 6) % 9 < 8 + +1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +1176 + +threadIdx_x + 1176 + +63 + +(threadIdx_x + 1176) // 63 + +49 + +(threadIdx_x + 1176) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 1176) // 63 * 49 + +7 + +threadIdx_x // 7 + +6 + +threadIdx_x // 7 + 6 + +9 + +(threadIdx_x // 7 + 6) % 9 + +7 + +(threadIdx_x // 7 + 6) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1176) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1176) 
// 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1176) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 1176) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 1176) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1176) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1176 + +threadIdx_x + 1176 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 1176] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1176) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 1176] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1176) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +4 + +threadIdx_x // 7 + 4 + +9 + +(threadIdx_x // 7 + 4) 
% 9 + +1 <= (threadIdx_x // 7 + 4) % 9 + +7 + +threadIdx_x // 7 + +4 + +threadIdx_x // 7 + 4 + +9 + +(threadIdx_x // 7 + 4) % 9 + +8 + +(threadIdx_x // 7 + 4) % 9 < 8 + +1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +1225 + +threadIdx_x + 1225 + +63 + +(threadIdx_x + 1225) // 63 + +49 + +(threadIdx_x + 1225) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 1225) // 63 * 49 + +7 + +threadIdx_x // 7 + +4 + +threadIdx_x // 7 + 4 + +9 + +(threadIdx_x // 7 + 4) % 9 + +7 + +(threadIdx_x // 7 + 4) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1225) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1225) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1225) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 1225) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 1225) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1225) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1225 + 
+threadIdx_x + 1225 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 1225] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1225) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 1225] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1225) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +42 + +threadIdx_x < 42 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +1274 + +threadIdx_x + 1274 + +63 + +(threadIdx_x + 1274) // 63 + +49 + +(threadIdx_x + 1274) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 1274) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 1274) // 63 * 49 + threadIdx_x + +rc_outer_outer * 3136 + (threadIdx_x + 1274) // 63 * 49 + threadIdx_x + rx_outer_outer + +6 + +rc_outer_outer * 3136 + (threadIdx_x + 1274) // 63 * 49 + threadIdx_x + rx_outer_outer + 6 + +data[rc_outer_outer * 3136 + 
(threadIdx_x + 1274) // 63 * 49 + threadIdx_x + rx_outer_outer + 6] + +T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1274) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + +1274 + +threadIdx_x + 1274 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 1274] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1274) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 1274] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1274) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + +7 + +7 <= threadIdx_x + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +rc_outer_outer * 3136 + threadIdx_x + +rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + +1021 + +rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1021 + +data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1021] + +T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + 
threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1021], T.float32(0.0)) + +1323 + +threadIdx_x + 1323 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 1323] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1021], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 1323] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1021], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +7 + +threadIdx_x // 7 + 7 + +9 + +(threadIdx_x // 7 + 7) % 9 + +1 <= (threadIdx_x // 7 + 7) % 9 + +7 + +threadIdx_x // 7 + +7 + +threadIdx_x // 7 + 7 + +9 + +(threadIdx_x // 7 + 7) % 9 + +8 + +(threadIdx_x // 7 + 7) % 9 < 8 + +1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +1372 + +threadIdx_x + 1372 + +63 + +(threadIdx_x + 1372) // 63 + +49 + +(threadIdx_x + 1372) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 1372) // 63 * 49 + +7 + +threadIdx_x // 7 
+ +7 + +threadIdx_x // 7 + 7 + +9 + +(threadIdx_x // 7 + 7) % 9 + +7 + +(threadIdx_x // 7 + 7) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1372) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1372) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1372) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 1372) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 1372) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1372) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1372 + +threadIdx_x + 1372 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 1372] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1372) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 1372] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 
8, data[rc_outer_outer * 3136 + (threadIdx_x + 1372) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +5 + +threadIdx_x // 7 + 5 + +9 + +(threadIdx_x // 7 + 5) % 9 + +1 <= (threadIdx_x // 7 + 5) % 9 + +7 + +threadIdx_x // 7 + +5 + +threadIdx_x // 7 + 5 + +9 + +(threadIdx_x // 7 + 5) % 9 + +8 + +(threadIdx_x // 7 + 5) % 9 < 8 + +1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +1421 + +threadIdx_x + 1421 + +63 + +(threadIdx_x + 1421) // 63 + +49 + +(threadIdx_x + 1421) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 1421) // 63 * 49 + +7 + +threadIdx_x // 7 + +5 + +threadIdx_x // 7 + 5 + +9 + +(threadIdx_x // 7 + 5) % 9 + +7 + +(threadIdx_x // 7 + 5) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1421) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1421) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1421) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 1421) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 1421) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 
<= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1421) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1421 + +threadIdx_x + 1421 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 1421] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1421) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 1421] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1421) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +3 + +threadIdx_x // 7 + 3 + +9 + +(threadIdx_x // 7 + 3) % 9 + +1 <= (threadIdx_x // 7 + 3) % 9 + +7 + +threadIdx_x // 7 + +3 + +threadIdx_x // 7 + 3 + +9 + +(threadIdx_x // 7 + 3) % 9 + +8 + +(threadIdx_x // 7 + 3) % 9 < 8 + +1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= 
(threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +1470 + +threadIdx_x + 1470 + +63 + +(threadIdx_x + 1470) // 63 + +49 + +(threadIdx_x + 1470) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 1470) // 63 * 49 + +7 + +threadIdx_x // 7 + +3 + +threadIdx_x // 7 + 3 + +9 + +(threadIdx_x // 7 + 3) % 9 + +7 + +(threadIdx_x // 7 + 3) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1470) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1470) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1470) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 1470) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 1470) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1470) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1470 + +threadIdx_x + 1470 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 1470] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1470) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with 
T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 1470] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1470) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +1519 + +threadIdx_x + 1519 + +63 + +(threadIdx_x + 1519) // 63 + +49 + +(threadIdx_x + 1519) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 1519) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 1519) // 63 * 49 + threadIdx_x + +rc_outer_outer * 3136 + (threadIdx_x + 1519) // 63 * 49 + threadIdx_x + rx_outer_outer + +1 + +rc_outer_outer * 3136 + (threadIdx_x + 1519) // 63 * 49 + threadIdx_x + rx_outer_outer - 1 + +data[rc_outer_outer * 3136 + (threadIdx_x + 1519) // 63 * 49 + threadIdx_x + rx_outer_outer - 1] + +T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1519) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + +1519 + +threadIdx_x + 1519 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +rx_outer_outer = T.int32() +threadIdx_x = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 1519] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1519) // 63 * 
49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 1519] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1519) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +8 + +threadIdx_x // 7 + 8 + +9 + +(threadIdx_x // 7 + 8) % 9 + +1 <= (threadIdx_x // 7 + 8) % 9 + +7 + +threadIdx_x // 7 + +8 + +threadIdx_x // 7 + 8 + +9 + +(threadIdx_x // 7 + 8) % 9 + +8 + +(threadIdx_x // 7 + 8) % 9 < 8 + +1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +1568 + +threadIdx_x + 1568 + +63 + +(threadIdx_x + 1568) // 63 + +49 + +(threadIdx_x + 1568) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 1568) // 63 * 49 + +7 + +threadIdx_x // 7 + +8 + +threadIdx_x // 7 + 8 + +9 + +(threadIdx_x // 7 + 8) % 9 + +7 + +(threadIdx_x // 7 + 8) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1568) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1568) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1568) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + 
+rc_outer_outer * 3136 + (threadIdx_x + 1568) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 1568) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1568) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1568 + +threadIdx_x + 1568 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 1568] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1568) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 1568] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1568) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +6 + +threadIdx_x // 7 + 6 + +9 + +(threadIdx_x // 7 + 6) % 9 + +1 <= (threadIdx_x // 7 + 6) % 9 + +7 + +threadIdx_x // 7 + +6 + +threadIdx_x // 7 + 6 + +9 + +(threadIdx_x // 7 + 6) % 9 + +8 + +(threadIdx_x // 7 + 6) % 9 < 8 + +1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x 
// 7 + 6) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +1617 + +threadIdx_x + 1617 + +63 + +(threadIdx_x + 1617) // 63 + +49 + +(threadIdx_x + 1617) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 1617) // 63 * 49 + +7 + +threadIdx_x // 7 + +6 + +threadIdx_x // 7 + 6 + +9 + +(threadIdx_x // 7 + 6) % 9 + +7 + +(threadIdx_x // 7 + 6) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1617) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1617) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1617) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 1617) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 1617) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1617) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1617 + +threadIdx_x + 1617 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 1617] = 
T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1617) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 1617] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1617) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +4 + +threadIdx_x // 7 + 4 + +9 + +(threadIdx_x // 7 + 4) % 9 + +1 <= (threadIdx_x // 7 + 4) % 9 + +7 + +threadIdx_x // 7 + +4 + +threadIdx_x // 7 + 4 + +9 + +(threadIdx_x // 7 + 4) % 9 + +8 + +(threadIdx_x // 7 + 4) % 9 < 8 + +1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +1666 + +threadIdx_x + 1666 + +63 + +(threadIdx_x + 1666) // 63 + +49 + +(threadIdx_x + 1666) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 1666) // 63 * 49 + +7 + +threadIdx_x // 7 + +4 + +threadIdx_x // 7 + 4 + +9 + +(threadIdx_x // 7 + 4) % 9 + +7 + +(threadIdx_x // 7 + 4) % 9 
* 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1666) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1666) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1666) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 1666) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 1666) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1666) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1666 + +threadIdx_x + 1666 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 1666] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1666) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 1666] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1666) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + 
rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +42 + +threadIdx_x < 42 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +1715 + +threadIdx_x + 1715 + +63 + +(threadIdx_x + 1715) // 63 + +49 + +(threadIdx_x + 1715) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 1715) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 1715) // 63 * 49 + threadIdx_x + +rc_outer_outer * 3136 + (threadIdx_x + 1715) // 63 * 49 + threadIdx_x + rx_outer_outer + +6 + +rc_outer_outer * 3136 + (threadIdx_x + 1715) // 63 * 49 + threadIdx_x + rx_outer_outer + 6 + +data[rc_outer_outer * 3136 + (threadIdx_x + 1715) // 63 * 49 + threadIdx_x + rx_outer_outer + 6] + +T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1715) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + +1715 + +threadIdx_x + 1715 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 1715] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1715) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 1715] = T.if_then_else(threadIdx_x < 42 and 1 <= 
rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1715) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + +7 + +7 <= threadIdx_x + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +rc_outer_outer * 3136 + threadIdx_x + +rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + +1364 + +rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1364 + +data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1364] + +T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1364], T.float32(0.0)) + +1764 + +threadIdx_x + 1764 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 1764] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1364], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 1764] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1364], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +7 + +threadIdx_x // 7 + 7 + +9 + 
+(threadIdx_x // 7 + 7) % 9 + +1 <= (threadIdx_x // 7 + 7) % 9 + +7 + +threadIdx_x // 7 + +7 + +threadIdx_x // 7 + 7 + +9 + +(threadIdx_x // 7 + 7) % 9 + +8 + +(threadIdx_x // 7 + 7) % 9 < 8 + +1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +1813 + +threadIdx_x + 1813 + +63 + +(threadIdx_x + 1813) // 63 + +49 + +(threadIdx_x + 1813) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 1813) // 63 * 49 + +7 + +threadIdx_x // 7 + +7 + +threadIdx_x // 7 + 7 + +9 + +(threadIdx_x // 7 + 7) % 9 + +7 + +(threadIdx_x // 7 + 7) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1813) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1813) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1813) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 1813) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 1813) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1813) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], 
T.float32(0.0)) + +1813 + +threadIdx_x + 1813 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 1813] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1813) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 1813] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1813) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +5 + +threadIdx_x // 7 + 5 + +9 + +(threadIdx_x // 7 + 5) % 9 + +1 <= (threadIdx_x // 7 + 5) % 9 + +7 + +threadIdx_x // 7 + +5 + +threadIdx_x // 7 + 5 + +9 + +(threadIdx_x // 7 + 5) % 9 + +8 + +(threadIdx_x // 7 + 5) % 9 < 8 + +1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +1862 + +threadIdx_x + 1862 + +63 + 
+(threadIdx_x + 1862) // 63 + +49 + +(threadIdx_x + 1862) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 1862) // 63 * 49 + +7 + +threadIdx_x // 7 + +5 + +threadIdx_x // 7 + 5 + +9 + +(threadIdx_x // 7 + 5) % 9 + +7 + +(threadIdx_x // 7 + 5) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1862) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1862) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1862) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 1862) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 1862) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1862) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1862 + +threadIdx_x + 1862 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 1862] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1862) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 1862] = 
T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1862) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +3 + +threadIdx_x // 7 + 3 + +9 + +(threadIdx_x // 7 + 3) % 9 + +1 <= (threadIdx_x // 7 + 3) % 9 + +7 + +threadIdx_x // 7 + +3 + +threadIdx_x // 7 + 3 + +9 + +(threadIdx_x // 7 + 3) % 9 + +8 + +(threadIdx_x // 7 + 3) % 9 < 8 + +1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +1911 + +threadIdx_x + 1911 + +63 + +(threadIdx_x + 1911) // 63 + +49 + +(threadIdx_x + 1911) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 1911) // 63 * 49 + +7 + +threadIdx_x // 7 + +3 + +threadIdx_x // 7 + 3 + +9 + +(threadIdx_x // 7 + 3) % 9 + +7 + +(threadIdx_x // 7 + 3) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1911) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1911) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 1911) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 1911) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 1911) // 63 * 49 
+ (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1911) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1911 + +threadIdx_x + 1911 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 1911] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1911) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 1911] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1911) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +1960 + +threadIdx_x + 1960 + +63 + +(threadIdx_x + 1960) // 63 + +49 + +(threadIdx_x + 1960) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 1960) // 63 * 49 + 
+rc_outer_outer * 3136 + (threadIdx_x + 1960) // 63 * 49 + threadIdx_x + +rc_outer_outer * 3136 + (threadIdx_x + 1960) // 63 * 49 + threadIdx_x + rx_outer_outer + +1 + +rc_outer_outer * 3136 + (threadIdx_x + 1960) // 63 * 49 + threadIdx_x + rx_outer_outer - 1 + +data[rc_outer_outer * 3136 + (threadIdx_x + 1960) // 63 * 49 + threadIdx_x + rx_outer_outer - 1] + +T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1960) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + +1960 + +threadIdx_x + 1960 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +rx_outer_outer = T.int32() +threadIdx_x = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 1960] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1960) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 1960] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1960) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +8 + +threadIdx_x // 7 + 8 + +9 + +(threadIdx_x // 7 + 8) % 9 + +1 <= (threadIdx_x // 7 + 8) % 9 + +7 + +threadIdx_x // 7 + +8 + +threadIdx_x // 7 + 8 + +9 + +(threadIdx_x // 7 + 8) % 9 + +8 + +(threadIdx_x // 7 + 8) % 9 < 8 + +1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + 
threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +2009 + +threadIdx_x + 2009 + +63 + +(threadIdx_x + 2009) // 63 + +49 + +(threadIdx_x + 2009) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 2009) // 63 * 49 + +7 + +threadIdx_x // 7 + +8 + +threadIdx_x // 7 + 8 + +9 + +(threadIdx_x // 7 + 8) % 9 + +7 + +(threadIdx_x // 7 + 8) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2009) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2009) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2009) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 2009) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 2009) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2009) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +2009 + +threadIdx_x + 2009 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 2009] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2009) // 63 
* 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 2009] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2009) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +6 + +threadIdx_x // 7 + 6 + +9 + +(threadIdx_x // 7 + 6) % 9 + +1 <= (threadIdx_x // 7 + 6) % 9 + +7 + +threadIdx_x // 7 + +6 + +threadIdx_x // 7 + 6 + +9 + +(threadIdx_x // 7 + 6) % 9 + +8 + +(threadIdx_x // 7 + 6) % 9 < 8 + +1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +2058 + +threadIdx_x + 2058 + +63 + +(threadIdx_x + 2058) // 63 + +49 + +(threadIdx_x + 2058) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 2058) // 63 * 49 + +7 + +threadIdx_x // 7 + +6 + +threadIdx_x // 7 + 6 + +9 + +(threadIdx_x // 7 + 6) % 9 + +7 + +(threadIdx_x // 7 + 6) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2058) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2058) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + +7 + 
+threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2058) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 2058) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 2058) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2058) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +2058 + +threadIdx_x + 2058 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 2058] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2058) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 2058] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2058) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +4 + +threadIdx_x // 7 + 4 + +9 + +(threadIdx_x // 7 + 4) % 9 + +1 <= (threadIdx_x // 7 + 4) % 9 + +7 + +threadIdx_x // 7 + 
+4 + +threadIdx_x // 7 + 4 + +9 + +(threadIdx_x // 7 + 4) % 9 + +8 + +(threadIdx_x // 7 + 4) % 9 < 8 + +1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +2107 + +threadIdx_x + 2107 + +63 + +(threadIdx_x + 2107) // 63 + +49 + +(threadIdx_x + 2107) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 2107) // 63 * 49 + +7 + +threadIdx_x // 7 + +4 + +threadIdx_x // 7 + 4 + +9 + +(threadIdx_x // 7 + 4) % 9 + +7 + +(threadIdx_x // 7 + 4) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2107) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2107) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2107) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 2107) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 2107) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2107) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +2107 + +threadIdx_x + 2107 + +pad_temp_shared = T.Buffer((4032,), 
scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 2107] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2107) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 2107] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2107) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +42 + +threadIdx_x < 42 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +2156 + +threadIdx_x + 2156 + +63 + +(threadIdx_x + 2156) // 63 + +49 + +(threadIdx_x + 2156) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 2156) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 2156) // 63 * 49 + threadIdx_x + +rc_outer_outer * 3136 + (threadIdx_x + 2156) // 63 * 49 + threadIdx_x + rx_outer_outer + +6 + +rc_outer_outer * 3136 + (threadIdx_x + 2156) // 63 * 49 + threadIdx_x + rx_outer_outer + 6 + +data[rc_outer_outer * 3136 + (threadIdx_x + 2156) // 63 * 49 + threadIdx_x + rx_outer_outer + 6] 
+ +T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2156) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + +2156 + +threadIdx_x + 2156 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 2156] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2156) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 2156] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2156) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + +7 + +7 <= threadIdx_x + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +rc_outer_outer * 3136 + threadIdx_x + +rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + +1707 + +rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1707 + +data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1707] + +T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + 
rx_outer_outer + 1707], T.float32(0.0)) + +2205 + +threadIdx_x + 2205 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 2205] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1707], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 2205] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1707], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +7 + +threadIdx_x // 7 + 7 + +9 + +(threadIdx_x // 7 + 7) % 9 + +1 <= (threadIdx_x // 7 + 7) % 9 + +7 + +threadIdx_x // 7 + +7 + +threadIdx_x // 7 + 7 + +9 + +(threadIdx_x // 7 + 7) % 9 + +8 + +(threadIdx_x // 7 + 7) % 9 < 8 + +1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +2254 + +threadIdx_x + 2254 + +63 + +(threadIdx_x + 2254) // 63 + +49 + +(threadIdx_x + 2254) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 2254) // 63 * 49 + +7 + +threadIdx_x // 7 + +7 + +threadIdx_x // 7 + 7 + +9 + +(threadIdx_x // 7 + 7) % 9 
+ +7 + +(threadIdx_x // 7 + 7) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2254) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2254) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2254) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 2254) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 2254) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2254) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +2254 + +threadIdx_x + 2254 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 2254] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2254) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 2254] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2254) // 63 * 49 
+ (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +5 + +threadIdx_x // 7 + 5 + +9 + +(threadIdx_x // 7 + 5) % 9 + +1 <= (threadIdx_x // 7 + 5) % 9 + +7 + +threadIdx_x // 7 + +5 + +threadIdx_x // 7 + 5 + +9 + +(threadIdx_x // 7 + 5) % 9 + +8 + +(threadIdx_x // 7 + 5) % 9 < 8 + +1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +2303 + +threadIdx_x + 2303 + +63 + +(threadIdx_x + 2303) // 63 + +49 + +(threadIdx_x + 2303) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 2303) // 63 * 49 + +7 + +threadIdx_x // 7 + +5 + +threadIdx_x // 7 + 5 + +9 + +(threadIdx_x // 7 + 5) % 9 + +7 + +(threadIdx_x // 7 + 5) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2303) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2303) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2303) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 2303) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 2303) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + 
threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2303) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +2303 + +threadIdx_x + 2303 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 2303] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2303) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 2303] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2303) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +3 + +threadIdx_x // 7 + 3 + +9 + +(threadIdx_x // 7 + 3) % 9 + +1 <= (threadIdx_x // 7 + 3) % 9 + +7 + +threadIdx_x // 7 + +3 + +threadIdx_x // 7 + 3 + +9 + +(threadIdx_x // 7 + 3) % 9 + +8 + +(threadIdx_x // 7 + 3) % 9 < 8 + +1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 
and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +2352 + +threadIdx_x + 2352 + +63 + +(threadIdx_x + 2352) // 63 + +49 + +(threadIdx_x + 2352) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 2352) // 63 * 49 + +7 + +threadIdx_x // 7 + +3 + +threadIdx_x // 7 + 3 + +9 + +(threadIdx_x // 7 + 3) % 9 + +7 + +(threadIdx_x // 7 + 3) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2352) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2352) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2352) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 2352) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 2352) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2352) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +2352 + +threadIdx_x + 2352 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 2352] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2352) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = 
T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 2352] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2352) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +2401 + +threadIdx_x + 2401 + +63 + +(threadIdx_x + 2401) // 63 + +49 + +(threadIdx_x + 2401) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 2401) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 2401) // 63 * 49 + threadIdx_x + +rc_outer_outer * 3136 + (threadIdx_x + 2401) // 63 * 49 + threadIdx_x + rx_outer_outer + +1 + +rc_outer_outer * 3136 + (threadIdx_x + 2401) // 63 * 49 + threadIdx_x + rx_outer_outer - 1 + +data[rc_outer_outer * 3136 + (threadIdx_x + 2401) // 63 * 49 + threadIdx_x + rx_outer_outer - 1] + +T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2401) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + +2401 + +threadIdx_x + 2401 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +rx_outer_outer = T.int32() +threadIdx_x = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 2401] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2401) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + +with 
T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 2401] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2401) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +8 + +threadIdx_x // 7 + 8 + +9 + +(threadIdx_x // 7 + 8) % 9 + +1 <= (threadIdx_x // 7 + 8) % 9 + +7 + +threadIdx_x // 7 + +8 + +threadIdx_x // 7 + 8 + +9 + +(threadIdx_x // 7 + 8) % 9 + +8 + +(threadIdx_x // 7 + 8) % 9 < 8 + +1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +2450 + +threadIdx_x + 2450 + +63 + +(threadIdx_x + 2450) // 63 + +49 + +(threadIdx_x + 2450) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 2450) // 63 * 49 + +7 + +threadIdx_x // 7 + +8 + +threadIdx_x // 7 + 8 + +9 + +(threadIdx_x // 7 + 8) % 9 + +7 + +(threadIdx_x // 7 + 8) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2450) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2450) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2450) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 2450) // 63 * 49 + 
(threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 2450) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2450) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +2450 + +threadIdx_x + 2450 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 2450] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2450) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 2450] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2450) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +6 + +threadIdx_x // 7 + 6 + +9 + +(threadIdx_x // 7 + 6) % 9 + +1 <= (threadIdx_x // 7 + 6) % 9 + +7 + +threadIdx_x // 7 + +6 + +threadIdx_x // 7 + 6 + +9 + +(threadIdx_x // 7 + 6) % 9 + +8 + +(threadIdx_x // 7 + 6) % 9 < 8 + +1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + 
+rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +2499 + +threadIdx_x + 2499 + +63 + +(threadIdx_x + 2499) // 63 + +49 + +(threadIdx_x + 2499) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 2499) // 63 * 49 + +7 + +threadIdx_x // 7 + +6 + +threadIdx_x // 7 + 6 + +9 + +(threadIdx_x // 7 + 6) % 9 + +7 + +(threadIdx_x // 7 + 6) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2499) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2499) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2499) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 2499) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 2499) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2499) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +2499 + +threadIdx_x + 2499 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 2499] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and 
(threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2499) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 2499] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2499) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +4 + +threadIdx_x // 7 + 4 + +9 + +(threadIdx_x // 7 + 4) % 9 + +1 <= (threadIdx_x // 7 + 4) % 9 + +7 + +threadIdx_x // 7 + +4 + +threadIdx_x // 7 + 4 + +9 + +(threadIdx_x // 7 + 4) % 9 + +8 + +(threadIdx_x // 7 + 4) % 9 < 8 + +1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +2548 + +threadIdx_x + 2548 + +63 + +(threadIdx_x + 2548) // 63 + +49 + +(threadIdx_x + 2548) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 2548) // 63 * 49 + +7 + +threadIdx_x // 7 + +4 + +threadIdx_x // 7 + 4 + +9 + +(threadIdx_x // 7 + 4) % 9 + +7 + +(threadIdx_x // 7 + 4) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2548) 
// 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2548) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2548) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 2548) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 2548) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2548) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +2548 + +threadIdx_x + 2548 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 2548] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2548) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 2548] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2548) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], 
T.float32(0.0)) + +42 + +threadIdx_x < 42 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +2597 + +threadIdx_x + 2597 + +63 + +(threadIdx_x + 2597) // 63 + +49 + +(threadIdx_x + 2597) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 2597) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 2597) // 63 * 49 + threadIdx_x + +rc_outer_outer * 3136 + (threadIdx_x + 2597) // 63 * 49 + threadIdx_x + rx_outer_outer + +6 + +rc_outer_outer * 3136 + (threadIdx_x + 2597) // 63 * 49 + threadIdx_x + rx_outer_outer + 6 + +data[rc_outer_outer * 3136 + (threadIdx_x + 2597) // 63 * 49 + threadIdx_x + rx_outer_outer + 6] + +T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2597) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + +2597 + +threadIdx_x + 2597 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 2597] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2597) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 2597] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and 
rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2597) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + +7 + +7 <= threadIdx_x + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +rc_outer_outer * 3136 + threadIdx_x + +rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + +2050 + +rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2050 + +data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2050] + +T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2050], T.float32(0.0)) + +2646 + +threadIdx_x + 2646 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 2646] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2050], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 2646] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2050], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +7 + +threadIdx_x // 7 + 7 + +9 + +(threadIdx_x // 7 + 7) % 9 + +1 <= 
(threadIdx_x // 7 + 7) % 9 + +7 + +threadIdx_x // 7 + +7 + +threadIdx_x // 7 + 7 + +9 + +(threadIdx_x // 7 + 7) % 9 + +8 + +(threadIdx_x // 7 + 7) % 9 < 8 + +1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +2695 + +threadIdx_x + 2695 + +63 + +(threadIdx_x + 2695) // 63 + +49 + +(threadIdx_x + 2695) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 2695) // 63 * 49 + +7 + +threadIdx_x // 7 + +7 + +threadIdx_x // 7 + 7 + +9 + +(threadIdx_x // 7 + 7) % 9 + +7 + +(threadIdx_x // 7 + 7) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2695) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2695) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2695) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 2695) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 2695) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2695) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +2695 + +threadIdx_x + 
2695 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 2695] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2695) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 2695] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2695) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +5 + +threadIdx_x // 7 + 5 + +9 + +(threadIdx_x // 7 + 5) % 9 + +1 <= (threadIdx_x // 7 + 5) % 9 + +7 + +threadIdx_x // 7 + +5 + +threadIdx_x // 7 + 5 + +9 + +(threadIdx_x // 7 + 5) % 9 + +8 + +(threadIdx_x // 7 + 5) % 9 < 8 + +1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +2744 + +threadIdx_x + 2744 + +63 + +(threadIdx_x + 2744) // 63 + +49 + 
+(threadIdx_x + 2744) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 2744) // 63 * 49 + +7 + +threadIdx_x // 7 + +5 + +threadIdx_x // 7 + 5 + +9 + +(threadIdx_x // 7 + 5) % 9 + +7 + +(threadIdx_x // 7 + 5) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2744) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2744) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2744) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 2744) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 2744) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2744) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +2744 + +threadIdx_x + 2744 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 2744] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2744) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 2744] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 
and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2744) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +3 + +threadIdx_x // 7 + 3 + +9 + +(threadIdx_x // 7 + 3) % 9 + +1 <= (threadIdx_x // 7 + 3) % 9 + +7 + +threadIdx_x // 7 + +3 + +threadIdx_x // 7 + 3 + +9 + +(threadIdx_x // 7 + 3) % 9 + +8 + +(threadIdx_x // 7 + 3) % 9 < 8 + +1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +2793 + +threadIdx_x + 2793 + +63 + +(threadIdx_x + 2793) // 63 + +49 + +(threadIdx_x + 2793) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 2793) // 63 * 49 + +7 + +threadIdx_x // 7 + +3 + +threadIdx_x // 7 + 3 + +9 + +(threadIdx_x // 7 + 3) % 9 + +7 + +(threadIdx_x // 7 + 3) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2793) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2793) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2793) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 2793) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 2793) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + 
rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2793) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +2793 + +threadIdx_x + 2793 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 2793] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2793) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 2793] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2793) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +2842 + +threadIdx_x + 2842 + +63 + +(threadIdx_x + 2842) // 63 + +49 + +(threadIdx_x + 2842) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 2842) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 
2842) // 63 * 49 + threadIdx_x + +rc_outer_outer * 3136 + (threadIdx_x + 2842) // 63 * 49 + threadIdx_x + rx_outer_outer + +1 + +rc_outer_outer * 3136 + (threadIdx_x + 2842) // 63 * 49 + threadIdx_x + rx_outer_outer - 1 + +data[rc_outer_outer * 3136 + (threadIdx_x + 2842) // 63 * 49 + threadIdx_x + rx_outer_outer - 1] + +T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2842) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + +2842 + +threadIdx_x + 2842 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +rx_outer_outer = T.int32() +threadIdx_x = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 2842] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2842) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 2842] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2842) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +8 + +threadIdx_x // 7 + 8 + +9 + +(threadIdx_x // 7 + 8) % 9 + +1 <= (threadIdx_x // 7 + 8) % 9 + +7 + +threadIdx_x // 7 + +8 + +threadIdx_x // 7 + 8 + +9 + +(threadIdx_x // 7 + 8) % 9 + +8 + +(threadIdx_x // 7 + 8) % 9 < 8 + +1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + 
+rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +2891 + +threadIdx_x + 2891 + +63 + +(threadIdx_x + 2891) // 63 + +49 + +(threadIdx_x + 2891) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 2891) // 63 * 49 + +7 + +threadIdx_x // 7 + +8 + +threadIdx_x // 7 + 8 + +9 + +(threadIdx_x // 7 + 8) % 9 + +7 + +(threadIdx_x // 7 + 8) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2891) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2891) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2891) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 2891) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 2891) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2891) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +2891 + +threadIdx_x + 2891 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 2891] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2891) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + 
rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 2891] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2891) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +6 + +threadIdx_x // 7 + 6 + +9 + +(threadIdx_x // 7 + 6) % 9 + +1 <= (threadIdx_x // 7 + 6) % 9 + +7 + +threadIdx_x // 7 + +6 + +threadIdx_x // 7 + 6 + +9 + +(threadIdx_x // 7 + 6) % 9 + +8 + +(threadIdx_x // 7 + 6) % 9 < 8 + +1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +2940 + +threadIdx_x + 2940 + +63 + +(threadIdx_x + 2940) // 63 + +49 + +(threadIdx_x + 2940) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 2940) // 63 * 49 + +7 + +threadIdx_x // 7 + +6 + +threadIdx_x // 7 + 6 + +9 + +(threadIdx_x // 7 + 6) % 9 + +7 + +(threadIdx_x // 7 + 6) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2940) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2940) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + 
(threadIdx_x + 2940) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 2940) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 2940) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2940) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +2940 + +threadIdx_x + 2940 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 2940] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2940) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 2940] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2940) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +4 + +threadIdx_x // 7 + 4 + +9 + +(threadIdx_x // 7 + 4) % 9 + +1 <= (threadIdx_x // 7 + 4) % 9 + +7 + +threadIdx_x // 7 + +4 + +threadIdx_x // 7 + 4 + +9 + 
+(threadIdx_x // 7 + 4) % 9 + +8 + +(threadIdx_x // 7 + 4) % 9 < 8 + +1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +2989 + +threadIdx_x + 2989 + +63 + +(threadIdx_x + 2989) // 63 + +49 + +(threadIdx_x + 2989) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 2989) // 63 * 49 + +7 + +threadIdx_x // 7 + +4 + +threadIdx_x // 7 + 4 + +9 + +(threadIdx_x // 7 + 4) % 9 + +7 + +(threadIdx_x // 7 + 4) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2989) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2989) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 2989) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 2989) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 2989) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2989) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +2989 + +threadIdx_x + 2989 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() 
+rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 2989] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2989) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 2989] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2989) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +42 + +threadIdx_x < 42 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +3038 + +threadIdx_x + 3038 + +63 + +(threadIdx_x + 3038) // 63 + +49 + +(threadIdx_x + 3038) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 3038) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 3038) // 63 * 49 + threadIdx_x + +rc_outer_outer * 3136 + (threadIdx_x + 3038) // 63 * 49 + threadIdx_x + rx_outer_outer + +6 + +rc_outer_outer * 3136 + (threadIdx_x + 3038) // 63 * 49 + threadIdx_x + rx_outer_outer + 6 + +data[rc_outer_outer * 3136 + (threadIdx_x + 3038) // 63 * 49 + threadIdx_x + rx_outer_outer + 6] + +T.if_then_else(threadIdx_x < 42 and 1 
<= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3038) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + +3038 + +threadIdx_x + 3038 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 3038] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3038) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 3038] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3038) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + +7 + +7 <= threadIdx_x + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +rc_outer_outer * 3136 + threadIdx_x + +rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + +2393 + +rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2393 + +data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2393] + +T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2393], T.float32(0.0)) + 
+3087 + +threadIdx_x + 3087 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 3087] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2393], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 3087] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2393], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +7 + +threadIdx_x // 7 + 7 + +9 + +(threadIdx_x // 7 + 7) % 9 + +1 <= (threadIdx_x // 7 + 7) % 9 + +7 + +threadIdx_x // 7 + +7 + +threadIdx_x // 7 + 7 + +9 + +(threadIdx_x // 7 + 7) % 9 + +8 + +(threadIdx_x // 7 + 7) % 9 < 8 + +1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +3136 + +threadIdx_x + 3136 + +63 + +(threadIdx_x + 3136) // 63 + +49 + +(threadIdx_x + 3136) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 3136) // 63 * 49 + +7 + +threadIdx_x // 7 + +7 + +threadIdx_x // 7 + 7 + +9 + +(threadIdx_x // 7 + 7) % 9 + +7 + +(threadIdx_x // 7 + 7) % 9 * 7 + 
+rc_outer_outer * 3136 + (threadIdx_x + 3136) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 3136) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 3136) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 3136) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 3136) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3136) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +3136 + +threadIdx_x + 3136 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 3136] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3136) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 3136] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3136) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + 
rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +5 + +threadIdx_x // 7 + 5 + +9 + +(threadIdx_x // 7 + 5) % 9 + +1 <= (threadIdx_x // 7 + 5) % 9 + +7 + +threadIdx_x // 7 + +5 + +threadIdx_x // 7 + 5 + +9 + +(threadIdx_x // 7 + 5) % 9 + +8 + +(threadIdx_x // 7 + 5) % 9 < 8 + +1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +3185 + +threadIdx_x + 3185 + +63 + +(threadIdx_x + 3185) // 63 + +49 + +(threadIdx_x + 3185) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 3185) // 63 * 49 + +7 + +threadIdx_x // 7 + +5 + +threadIdx_x // 7 + 5 + +9 + +(threadIdx_x // 7 + 5) % 9 + +7 + +(threadIdx_x // 7 + 5) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 3185) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 3185) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 3185) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 3185) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 3185) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer 
* 3136 + (threadIdx_x + 3185) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +3185 + +threadIdx_x + 3185 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 3185] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3185) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 3185] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3185) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +3 + +threadIdx_x // 7 + 3 + +9 + +(threadIdx_x // 7 + 3) % 9 + +1 <= (threadIdx_x // 7 + 3) % 9 + +7 + +threadIdx_x // 7 + +3 + +threadIdx_x // 7 + 3 + +9 + +(threadIdx_x // 7 + 3) % 9 + +8 + +(threadIdx_x // 7 + 3) % 9 < 8 + +1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 
and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +3234 + +threadIdx_x + 3234 + +63 + +(threadIdx_x + 3234) // 63 + +49 + +(threadIdx_x + 3234) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 3234) // 63 * 49 + +7 + +threadIdx_x // 7 + +3 + +threadIdx_x // 7 + 3 + +9 + +(threadIdx_x // 7 + 3) % 9 + +7 + +(threadIdx_x // 7 + 3) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 3234) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 3234) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 3234) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 3234) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 3234) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3234) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +3234 + +threadIdx_x + 3234 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 3234] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3234) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = 
T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 3234] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3234) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +3283 + +threadIdx_x + 3283 + +63 + +(threadIdx_x + 3283) // 63 + +49 + +(threadIdx_x + 3283) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 3283) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 3283) // 63 * 49 + threadIdx_x + +rc_outer_outer * 3136 + (threadIdx_x + 3283) // 63 * 49 + threadIdx_x + rx_outer_outer + +1 + +rc_outer_outer * 3136 + (threadIdx_x + 3283) // 63 * 49 + threadIdx_x + rx_outer_outer - 1 + +data[rc_outer_outer * 3136 + (threadIdx_x + 3283) // 63 * 49 + threadIdx_x + rx_outer_outer - 1] + +T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3283) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + +3283 + +threadIdx_x + 3283 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +rx_outer_outer = T.int32() +threadIdx_x = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 3283] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3283) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + 
pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 3283] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3283) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +8 + +threadIdx_x // 7 + 8 + +9 + +(threadIdx_x // 7 + 8) % 9 + +1 <= (threadIdx_x // 7 + 8) % 9 + +7 + +threadIdx_x // 7 + +8 + +threadIdx_x // 7 + 8 + +9 + +(threadIdx_x // 7 + 8) % 9 + +8 + +(threadIdx_x // 7 + 8) % 9 < 8 + +1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +3332 + +threadIdx_x + 3332 + +63 + +(threadIdx_x + 3332) // 63 + +49 + +(threadIdx_x + 3332) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 3332) // 63 * 49 + +7 + +threadIdx_x // 7 + +8 + +threadIdx_x // 7 + 8 + +9 + +(threadIdx_x // 7 + 8) % 9 + +7 + +(threadIdx_x // 7 + 8) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 3332) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 3332) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 3332) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 3332) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 
7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 3332) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3332) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +3332 + +threadIdx_x + 3332 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 3332] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3332) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 3332] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3332) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +6 + +threadIdx_x // 7 + 6 + +9 + +(threadIdx_x // 7 + 6) % 9 + +1 <= (threadIdx_x // 7 + 6) % 9 + +7 + +threadIdx_x // 7 + +6 + +threadIdx_x // 7 + 6 + +9 + +(threadIdx_x // 7 + 6) % 9 + +8 + +(threadIdx_x // 7 + 6) % 9 < 8 + +1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 
7 + +1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +3381 + +threadIdx_x + 3381 + +63 + +(threadIdx_x + 3381) // 63 + +49 + +(threadIdx_x + 3381) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 3381) // 63 * 49 + +7 + +threadIdx_x // 7 + +6 + +threadIdx_x // 7 + 6 + +9 + +(threadIdx_x // 7 + 6) % 9 + +7 + +(threadIdx_x // 7 + 6) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 3381) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 3381) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 3381) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 3381) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 3381) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3381) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +3381 + +threadIdx_x + 3381 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 3381] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 
and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3381) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 3381] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3381) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +4 + +threadIdx_x // 7 + 4 + +9 + +(threadIdx_x // 7 + 4) % 9 + +1 <= (threadIdx_x // 7 + 4) % 9 + +7 + +threadIdx_x // 7 + +4 + +threadIdx_x // 7 + 4 + +9 + +(threadIdx_x // 7 + 4) % 9 + +8 + +(threadIdx_x // 7 + 4) % 9 < 8 + +1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +3430 + +threadIdx_x + 3430 + +63 + +(threadIdx_x + 3430) // 63 + +49 + +(threadIdx_x + 3430) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 3430) // 63 * 49 + +7 + +threadIdx_x // 7 + +4 + +threadIdx_x // 7 + 4 + +9 + +(threadIdx_x // 7 + 4) % 9 + +7 + +(threadIdx_x // 7 + 4) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 3430) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + +rc_outer_outer * 3136 + 
(threadIdx_x + 3430) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 3430) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 3430) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 3430) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3430) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +3430 + +threadIdx_x + 3430 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 3430] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3430) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 3430] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3430) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +42 + +threadIdx_x < 42 + +1 + +7 + +threadIdx_x % 7 + 
+rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +3479 + +threadIdx_x + 3479 + +63 + +(threadIdx_x + 3479) // 63 + +49 + +(threadIdx_x + 3479) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 3479) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 3479) // 63 * 49 + threadIdx_x + +rc_outer_outer * 3136 + (threadIdx_x + 3479) // 63 * 49 + threadIdx_x + rx_outer_outer + +6 + +rc_outer_outer * 3136 + (threadIdx_x + 3479) // 63 * 49 + threadIdx_x + rx_outer_outer + 6 + +data[rc_outer_outer * 3136 + (threadIdx_x + 3479) // 63 * 49 + threadIdx_x + rx_outer_outer + 6] + +T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3479) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + +3479 + +threadIdx_x + 3479 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 3479] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3479) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 3479] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 
3479) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + +7 + +7 <= threadIdx_x + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +rc_outer_outer * 3136 + threadIdx_x + +rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + +2736 + +rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2736 + +data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2736] + +T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2736], T.float32(0.0)) + +3528 + +threadIdx_x + 3528 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 3528] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2736], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 3528] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2736], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +7 + +threadIdx_x // 7 + 7 + +9 + +(threadIdx_x // 7 + 7) % 9 + +1 <= (threadIdx_x // 7 + 7) % 9 + +7 + +threadIdx_x // 7 + +7 + +threadIdx_x // 7 + 7 + +9 + 
+(threadIdx_x // 7 + 7) % 9 + +8 + +(threadIdx_x // 7 + 7) % 9 < 8 + +1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +3577 + +threadIdx_x + 3577 + +63 + +(threadIdx_x + 3577) // 63 + +49 + +(threadIdx_x + 3577) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 3577) // 63 * 49 + +7 + +threadIdx_x // 7 + +7 + +threadIdx_x // 7 + 7 + +9 + +(threadIdx_x // 7 + 7) % 9 + +7 + +(threadIdx_x // 7 + 7) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 3577) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 3577) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 3577) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 3577) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 3577) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3577) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +3577 + +threadIdx_x + 3577 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() 
+rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 3577] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3577) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 3577] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3577) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +5 + +threadIdx_x // 7 + 5 + +9 + +(threadIdx_x // 7 + 5) % 9 + +1 <= (threadIdx_x // 7 + 5) % 9 + +7 + +threadIdx_x // 7 + +5 + +threadIdx_x // 7 + 5 + +9 + +(threadIdx_x // 7 + 5) % 9 + +8 + +(threadIdx_x // 7 + 5) % 9 < 8 + +1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +3626 + +threadIdx_x + 3626 + +63 + +(threadIdx_x + 3626) // 63 + +49 + +(threadIdx_x + 3626) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 3626) // 63 * 49 + 
+7 + +threadIdx_x // 7 + +5 + +threadIdx_x // 7 + 5 + +9 + +(threadIdx_x // 7 + 5) % 9 + +7 + +(threadIdx_x // 7 + 5) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 3626) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 3626) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 3626) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 3626) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 3626) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3626) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +3626 + +threadIdx_x + 3626 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 3626] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3626) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 3626] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and 
rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3626) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +3 + +threadIdx_x // 7 + 3 + +9 + +(threadIdx_x // 7 + 3) % 9 + +1 <= (threadIdx_x // 7 + 3) % 9 + +7 + +threadIdx_x // 7 + +3 + +threadIdx_x // 7 + 3 + +9 + +(threadIdx_x // 7 + 3) % 9 + +8 + +(threadIdx_x // 7 + 3) % 9 < 8 + +1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +3675 + +threadIdx_x + 3675 + +63 + +(threadIdx_x + 3675) // 63 + +49 + +(threadIdx_x + 3675) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 3675) // 63 * 49 + +7 + +threadIdx_x // 7 + +3 + +threadIdx_x // 7 + 3 + +9 + +(threadIdx_x // 7 + 3) % 9 + +7 + +(threadIdx_x // 7 + 3) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 3675) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 3675) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 3675) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 3675) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 3675) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and 
(threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3675) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +3675 + +threadIdx_x + 3675 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 3675] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3675) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 3675] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3675) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +3724 + +threadIdx_x + 3724 + +63 + +(threadIdx_x + 3724) // 63 + +49 + +(threadIdx_x + 3724) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 3724) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 3724) // 63 * 49 + threadIdx_x + +rc_outer_outer * 3136 + (threadIdx_x + 3724) // 63 * 49 + 
threadIdx_x + rx_outer_outer + +1 + +rc_outer_outer * 3136 + (threadIdx_x + 3724) // 63 * 49 + threadIdx_x + rx_outer_outer - 1 + +data[rc_outer_outer * 3136 + (threadIdx_x + 3724) // 63 * 49 + threadIdx_x + rx_outer_outer - 1] + +T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3724) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + +3724 + +threadIdx_x + 3724 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +rx_outer_outer = T.int32() +threadIdx_x = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 3724] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3724) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 3724] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3724) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +8 + +threadIdx_x // 7 + 8 + +9 + +(threadIdx_x // 7 + 8) % 9 + +1 <= (threadIdx_x // 7 + 8) % 9 + +7 + +threadIdx_x // 7 + +8 + +threadIdx_x // 7 + 8 + +9 + +(threadIdx_x // 7 + 8) % 9 + +8 + +(threadIdx_x // 7 + 8) % 9 < 8 + +1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x 
// 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +3773 + +threadIdx_x + 3773 + +63 + +(threadIdx_x + 3773) // 63 + +49 + +(threadIdx_x + 3773) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 3773) // 63 * 49 + +7 + +threadIdx_x // 7 + +8 + +threadIdx_x // 7 + 8 + +9 + +(threadIdx_x // 7 + 8) % 9 + +7 + +(threadIdx_x // 7 + 8) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 3773) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 3773) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 3773) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 3773) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 3773) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3773) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +3773 + +threadIdx_x + 3773 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 3773] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3773) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as 
threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 3773] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3773) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +6 + +threadIdx_x // 7 + 6 + +9 + +(threadIdx_x // 7 + 6) % 9 + +1 <= (threadIdx_x // 7 + 6) % 9 + +7 + +threadIdx_x // 7 + +6 + +threadIdx_x // 7 + 6 + +9 + +(threadIdx_x // 7 + 6) % 9 + +8 + +(threadIdx_x // 7 + 6) % 9 < 8 + +1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +3822 + +threadIdx_x + 3822 + +63 + +(threadIdx_x + 3822) // 63 + +49 + +(threadIdx_x + 3822) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 3822) // 63 * 49 + +7 + +threadIdx_x // 7 + +6 + +threadIdx_x // 7 + 6 + +9 + +(threadIdx_x // 7 + 6) % 9 + +7 + +(threadIdx_x // 7 + 6) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 3822) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 3822) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 3822) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + 
+rc_outer_outer * 3136 + (threadIdx_x + 3822) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 3822) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3822) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +3822 + +threadIdx_x + 3822 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 3822] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3822) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 3822] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3822) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +1 + +7 + +threadIdx_x // 7 + +4 + +threadIdx_x // 7 + 4 + +9 + +(threadIdx_x // 7 + 4) % 9 + +1 <= (threadIdx_x // 7 + 4) % 9 + +7 + +threadIdx_x // 7 + +4 + +threadIdx_x // 7 + 4 + +9 + +(threadIdx_x // 7 + 4) % 9 + +8 + +(threadIdx_x // 7 + 4) % 9 < 8 + +1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x 
// 7 + 4) % 9 < 8 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +3871 + +threadIdx_x + 3871 + +63 + +(threadIdx_x + 3871) // 63 + +49 + +(threadIdx_x + 3871) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 3871) // 63 * 49 + +7 + +threadIdx_x // 7 + +4 + +threadIdx_x // 7 + 4 + +9 + +(threadIdx_x // 7 + 4) % 9 + +7 + +(threadIdx_x // 7 + 4) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 3871) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + +rc_outer_outer * 3136 + (threadIdx_x + 3871) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + +7 + +threadIdx_x % 7 + +rc_outer_outer * 3136 + (threadIdx_x + 3871) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 + +8 + +rc_outer_outer * 3136 + (threadIdx_x + 3871) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 + +data[rc_outer_outer * 3136 + (threadIdx_x + 3871) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] + +T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3871) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +3871 + +threadIdx_x + 3871 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 3871] = 
T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3871) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 3871] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3871) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + +42 + +threadIdx_x < 42 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +3920 + +threadIdx_x + 3920 + +63 + +(threadIdx_x + 3920) // 63 + +49 + +(threadIdx_x + 3920) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 3920) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 3920) // 63 * 49 + threadIdx_x + +rc_outer_outer * 3136 + (threadIdx_x + 3920) // 63 * 49 + threadIdx_x + rx_outer_outer + +6 + +rc_outer_outer * 3136 + (threadIdx_x + 3920) // 63 * 49 + threadIdx_x + rx_outer_outer + 6 + +data[rc_outer_outer * 3136 + (threadIdx_x + 3920) // 63 * 49 + threadIdx_x + rx_outer_outer + 6] + +T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 
3920) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + +3920 + +threadIdx_x + 3920 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 3920] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3920) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 3920] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3920) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + +7 + +7 <= threadIdx_x + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +rc_outer_outer * 3136 + threadIdx_x + +rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + +3079 + +rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 3079 + +data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 3079] + +T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 3079], T.float32(0.0)) + +3969 + +threadIdx_x + 3969 + +pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = 
T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 3969] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 3079], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 3969] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 3079], T.float32(0.0)) + +14 + +threadIdx_x < 14 + +7 + +threadIdx_x < 7 + +1 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +1 <= rx_outer_outer + threadIdx_x % 7 + +threadIdx_x < 7 and 1 <= rx_outer_outer + threadIdx_x % 7 + +7 + +threadIdx_x % 7 + +rx_outer_outer + threadIdx_x % 7 + +8 + +rx_outer_outer + threadIdx_x % 7 < 8 + +threadIdx_x < 7 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 + +3136 + +rc_outer_outer * 3136 + +4018 + +threadIdx_x + 4018 + +63 + +(threadIdx_x + 4018) // 63 + +49 + +(threadIdx_x + 4018) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 4018) // 63 * 49 + +rc_outer_outer * 3136 + (threadIdx_x + 4018) // 63 * 49 + rx_outer_outer + +rc_outer_outer * 3136 + (threadIdx_x + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x + +41 + +rc_outer_outer * 3136 + (threadIdx_x + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x + 41 + +data[rc_outer_outer * 3136 + (threadIdx_x + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x + 41] + +T.if_then_else(threadIdx_x < 7 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x + 41], T.float32(0.0)) + +4018 + +threadIdx_x + 4018 + 
+pad_temp_shared = T.Buffer((4032,), scope="shared") +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +pad_temp_shared[threadIdx_x + 4018] = T.if_then_else(threadIdx_x < 7 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x + 41], T.float32(0.0)) + +threadIdx_x = T.int32() +if threadIdx_x < 14: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 4018] = T.if_then_else(threadIdx_x < 7 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x + 41], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if threadIdx_x < 14: + pad_temp_shared = T.Buffer((4032,), scope="shared") + rx_outer_outer = T.int32() + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + pad_temp_shared[threadIdx_x + 4018] = T.if_then_else(threadIdx_x < 7 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x + 41], T.float32(0.0)) + +49 + +blockIdx_x + +36864 + +blockIdx_x * 36864 + +576 + +rc_outer_outer * 576 + +blockIdx_x * 36864 + rc_outer_outer * 576 + +threadIdx_x + +3 + +threadIdx_x * 3 + +blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x * 3 + +blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer] + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +rc_outer_outer = T.int32() +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +kernel_shared[threadIdx_x] = 
kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[threadIdx_x] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer] + +36864 + +blockIdx_x * 36864 + +576 + +rc_outer_outer * 576 + +blockIdx_x * 36864 + rc_outer_outer * 576 + +3 + +threadIdx_x * 3 + +blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x * 3 + +blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + +147 + +blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 147 + +kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 147] + +49 + +threadIdx_x + 49 + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +rc_outer_outer = T.int32() +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +kernel_shared[threadIdx_x + 49] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 147] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[threadIdx_x + 49] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 147] + +36864 + +blockIdx_x * 36864 + +576 + +rc_outer_outer * 576 + +blockIdx_x * 36864 + rc_outer_outer * 576 + +3 + +threadIdx_x * 3 + +blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x * 3 + +blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + +294 + +blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 294 + +kernel[blockIdx_x * 36864 + rc_outer_outer * 
576 + threadIdx_x * 3 + rx_outer_outer + 294] + +98 + +threadIdx_x + 98 + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +rc_outer_outer = T.int32() +threadIdx_x = T.int32() +rx_outer_outer = T.int32() +kernel_shared[threadIdx_x + 98] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 294] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[threadIdx_x + 98] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 294] + +36864 + +blockIdx_x * 36864 + +147 + +threadIdx_x + 147 + +192 + +(threadIdx_x + 147) // 192 + +4608 + +(threadIdx_x + 147) // 192 * 4608 + +blockIdx_x * 36864 + (threadIdx_x + 147) // 192 * 4608 + +576 + +rc_outer_outer * 576 + +blockIdx_x * 36864 + (threadIdx_x + 147) // 192 * 4608 + rc_outer_outer * 576 + +147 + +threadIdx_x + 147 + +192 + +(threadIdx_x + 147) % 192 + +3 + +(threadIdx_x + 147) % 192 * 3 + +blockIdx_x * 36864 + (threadIdx_x + 147) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 147) % 192 * 3 + +blockIdx_x * 36864 + (threadIdx_x + 147) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 147) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (threadIdx_x + 147) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 147) % 192 * 3 + rx_outer_outer] + +147 + +threadIdx_x + 147 + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[threadIdx_x + 147] = kernel[blockIdx_x * 36864 + (threadIdx_x + 147) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 147) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + kernel_shared = 
T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[threadIdx_x + 147] = kernel[blockIdx_x * 36864 + (threadIdx_x + 147) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 147) % 192 * 3 + rx_outer_outer] + +36864 + +blockIdx_x * 36864 + +196 + +threadIdx_x + 196 + +192 + +(threadIdx_x + 196) // 192 + +4608 + +(threadIdx_x + 196) // 192 * 4608 + +blockIdx_x * 36864 + (threadIdx_x + 196) // 192 * 4608 + +576 + +rc_outer_outer * 576 + +blockIdx_x * 36864 + (threadIdx_x + 196) // 192 * 4608 + rc_outer_outer * 576 + +3 + +threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 196) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 196) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + +12 + +blockIdx_x * 36864 + (threadIdx_x + 196) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 12 + +kernel[blockIdx_x * 36864 + (threadIdx_x + 196) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 12] + +196 + +threadIdx_x + 196 + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[threadIdx_x + 196] = kernel[blockIdx_x * 36864 + (threadIdx_x + 196) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 12] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[threadIdx_x + 196] = kernel[blockIdx_x * 36864 + (threadIdx_x + 196) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 12] + +36864 + +blockIdx_x * 36864 + +245 + +threadIdx_x + 245 + +192 + +(threadIdx_x + 245) // 192 + +4608 
+ +(threadIdx_x + 245) // 192 * 4608 + +blockIdx_x * 36864 + (threadIdx_x + 245) // 192 * 4608 + +576 + +rc_outer_outer * 576 + +blockIdx_x * 36864 + (threadIdx_x + 245) // 192 * 4608 + rc_outer_outer * 576 + +3 + +threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 245) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 245) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + +159 + +blockIdx_x * 36864 + (threadIdx_x + 245) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 159 + +kernel[blockIdx_x * 36864 + (threadIdx_x + 245) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 159] + +245 + +threadIdx_x + 245 + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[threadIdx_x + 245] = kernel[blockIdx_x * 36864 + (threadIdx_x + 245) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 159] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[threadIdx_x + 245] = kernel[blockIdx_x * 36864 + (threadIdx_x + 245) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 159] + +36864 + +blockIdx_x * 36864 + +294 + +threadIdx_x + 294 + +192 + +(threadIdx_x + 294) // 192 + +4608 + +(threadIdx_x + 294) // 192 * 4608 + +blockIdx_x * 36864 + (threadIdx_x + 294) // 192 * 4608 + +576 + +rc_outer_outer * 576 + +blockIdx_x * 36864 + (threadIdx_x + 294) // 192 * 4608 + rc_outer_outer * 576 + +3 + +threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 294) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 294) // 192 * 4608 + rc_outer_outer * 576 + 
threadIdx_x * 3 + rx_outer_outer + +306 + +blockIdx_x * 36864 + (threadIdx_x + 294) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 306 + +kernel[blockIdx_x * 36864 + (threadIdx_x + 294) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 306] + +294 + +threadIdx_x + 294 + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[threadIdx_x + 294] = kernel[blockIdx_x * 36864 + (threadIdx_x + 294) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 306] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[threadIdx_x + 294] = kernel[blockIdx_x * 36864 + (threadIdx_x + 294) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 306] + +36864 + +blockIdx_x * 36864 + +343 + +threadIdx_x + 343 + +192 + +(threadIdx_x + 343) // 192 + +4608 + +(threadIdx_x + 343) // 192 * 4608 + +blockIdx_x * 36864 + (threadIdx_x + 343) // 192 * 4608 + +576 + +rc_outer_outer * 576 + +blockIdx_x * 36864 + (threadIdx_x + 343) // 192 * 4608 + rc_outer_outer * 576 + +151 + +threadIdx_x + 151 + +192 + +(threadIdx_x + 151) % 192 + +3 + +(threadIdx_x + 151) % 192 * 3 + +blockIdx_x * 36864 + (threadIdx_x + 343) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 151) % 192 * 3 + +blockIdx_x * 36864 + (threadIdx_x + 343) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 151) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (threadIdx_x + 343) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 151) % 192 * 3 + rx_outer_outer] + +343 + +threadIdx_x + 343 + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() 
+threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[threadIdx_x + 343] = kernel[blockIdx_x * 36864 + (threadIdx_x + 343) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 151) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[threadIdx_x + 343] = kernel[blockIdx_x * 36864 + (threadIdx_x + 343) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 151) % 192 * 3 + rx_outer_outer] + +36864 + +blockIdx_x * 36864 + +392 + +threadIdx_x + 392 + +192 + +(threadIdx_x + 392) // 192 + +4608 + +(threadIdx_x + 392) // 192 * 4608 + +blockIdx_x * 36864 + (threadIdx_x + 392) // 192 * 4608 + +576 + +rc_outer_outer * 576 + +blockIdx_x * 36864 + (threadIdx_x + 392) // 192 * 4608 + rc_outer_outer * 576 + +3 + +threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 392) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 392) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + +24 + +blockIdx_x * 36864 + (threadIdx_x + 392) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 24 + +kernel[blockIdx_x * 36864 + (threadIdx_x + 392) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 24] + +392 + +threadIdx_x + 392 + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[threadIdx_x + 392] = kernel[blockIdx_x * 36864 + (threadIdx_x + 392) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 24] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = 
T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[threadIdx_x + 392] = kernel[blockIdx_x * 36864 + (threadIdx_x + 392) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 24] + +36864 + +blockIdx_x * 36864 + +441 + +threadIdx_x + 441 + +192 + +(threadIdx_x + 441) // 192 + +4608 + +(threadIdx_x + 441) // 192 * 4608 + +blockIdx_x * 36864 + (threadIdx_x + 441) // 192 * 4608 + +576 + +rc_outer_outer * 576 + +blockIdx_x * 36864 + (threadIdx_x + 441) // 192 * 4608 + rc_outer_outer * 576 + +3 + +threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 441) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 441) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + +171 + +blockIdx_x * 36864 + (threadIdx_x + 441) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 171 + +kernel[blockIdx_x * 36864 + (threadIdx_x + 441) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 171] + +441 + +threadIdx_x + 441 + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[threadIdx_x + 441] = kernel[blockIdx_x * 36864 + (threadIdx_x + 441) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 171] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[threadIdx_x + 441] = kernel[blockIdx_x * 36864 + (threadIdx_x + 441) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 171] + +36864 + +blockIdx_x * 36864 + +490 + +threadIdx_x + 490 + +192 + +(threadIdx_x + 490) // 192 + +4608 + +(threadIdx_x + 490) // 192 * 4608 + +blockIdx_x * 36864 + (threadIdx_x + 490) // 
192 * 4608 + +576 + +rc_outer_outer * 576 + +blockIdx_x * 36864 + (threadIdx_x + 490) // 192 * 4608 + rc_outer_outer * 576 + +3 + +threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 490) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 490) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + +318 + +blockIdx_x * 36864 + (threadIdx_x + 490) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 318 + +kernel[blockIdx_x * 36864 + (threadIdx_x + 490) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 318] + +490 + +threadIdx_x + 490 + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[threadIdx_x + 490] = kernel[blockIdx_x * 36864 + (threadIdx_x + 490) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 318] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[threadIdx_x + 490] = kernel[blockIdx_x * 36864 + (threadIdx_x + 490) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 318] + +36864 + +blockIdx_x * 36864 + +539 + +threadIdx_x + 539 + +192 + +(threadIdx_x + 539) // 192 + +4608 + +(threadIdx_x + 539) // 192 * 4608 + +blockIdx_x * 36864 + (threadIdx_x + 539) // 192 * 4608 + +576 + +rc_outer_outer * 576 + +blockIdx_x * 36864 + (threadIdx_x + 539) // 192 * 4608 + rc_outer_outer * 576 + +155 + +threadIdx_x + 155 + +192 + +(threadIdx_x + 155) % 192 + +3 + +(threadIdx_x + 155) % 192 * 3 + +blockIdx_x * 36864 + (threadIdx_x + 539) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 155) % 192 * 3 + +blockIdx_x * 36864 + (threadIdx_x + 539) // 192 * 4608 + rc_outer_outer * 576 + 
(threadIdx_x + 155) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (threadIdx_x + 539) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 155) % 192 * 3 + rx_outer_outer] + +539 + +threadIdx_x + 539 + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[threadIdx_x + 539] = kernel[blockIdx_x * 36864 + (threadIdx_x + 539) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 155) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[threadIdx_x + 539] = kernel[blockIdx_x * 36864 + (threadIdx_x + 539) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 155) % 192 * 3 + rx_outer_outer] + +36864 + +blockIdx_x * 36864 + +588 + +threadIdx_x + 588 + +192 + +(threadIdx_x + 588) // 192 + +4608 + +(threadIdx_x + 588) // 192 * 4608 + +blockIdx_x * 36864 + (threadIdx_x + 588) // 192 * 4608 + +576 + +rc_outer_outer * 576 + +blockIdx_x * 36864 + (threadIdx_x + 588) // 192 * 4608 + rc_outer_outer * 576 + +3 + +threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 588) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 588) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + +36 + +blockIdx_x * 36864 + (threadIdx_x + 588) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 36 + +kernel[blockIdx_x * 36864 + (threadIdx_x + 588) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 36] + +588 + +threadIdx_x + 588 + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() 
+kernel_shared[threadIdx_x + 588] = kernel[blockIdx_x * 36864 + (threadIdx_x + 588) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 36] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[threadIdx_x + 588] = kernel[blockIdx_x * 36864 + (threadIdx_x + 588) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 36] + +36864 + +blockIdx_x * 36864 + +637 + +threadIdx_x + 637 + +192 + +(threadIdx_x + 637) // 192 + +4608 + +(threadIdx_x + 637) // 192 * 4608 + +blockIdx_x * 36864 + (threadIdx_x + 637) // 192 * 4608 + +576 + +rc_outer_outer * 576 + +blockIdx_x * 36864 + (threadIdx_x + 637) // 192 * 4608 + rc_outer_outer * 576 + +3 + +threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 637) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 637) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + +183 + +blockIdx_x * 36864 + (threadIdx_x + 637) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 183 + +kernel[blockIdx_x * 36864 + (threadIdx_x + 637) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 183] + +637 + +threadIdx_x + 637 + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[threadIdx_x + 637] = kernel[blockIdx_x * 36864 + (threadIdx_x + 637) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 183] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[threadIdx_x + 
637] = kernel[blockIdx_x * 36864 + (threadIdx_x + 637) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 183] + +36864 + +blockIdx_x * 36864 + +686 + +threadIdx_x + 686 + +192 + +(threadIdx_x + 686) // 192 + +4608 + +(threadIdx_x + 686) // 192 * 4608 + +blockIdx_x * 36864 + (threadIdx_x + 686) // 192 * 4608 + +576 + +rc_outer_outer * 576 + +blockIdx_x * 36864 + (threadIdx_x + 686) // 192 * 4608 + rc_outer_outer * 576 + +3 + +threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 686) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 686) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + +330 + +blockIdx_x * 36864 + (threadIdx_x + 686) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 330 + +kernel[blockIdx_x * 36864 + (threadIdx_x + 686) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 330] + +686 + +threadIdx_x + 686 + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[threadIdx_x + 686] = kernel[blockIdx_x * 36864 + (threadIdx_x + 686) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 330] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[threadIdx_x + 686] = kernel[blockIdx_x * 36864 + (threadIdx_x + 686) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 330] + +36864 + +blockIdx_x * 36864 + +735 + +threadIdx_x + 735 + +192 + +(threadIdx_x + 735) // 192 + +4608 + +(threadIdx_x + 735) // 192 * 4608 + +blockIdx_x * 36864 + (threadIdx_x + 735) // 192 * 4608 + +576 + +rc_outer_outer * 576 + +blockIdx_x * 36864 + (threadIdx_x + 735) // 192 * 
4608 + rc_outer_outer * 576 + +159 + +threadIdx_x + 159 + +192 + +(threadIdx_x + 159) % 192 + +3 + +(threadIdx_x + 159) % 192 * 3 + +blockIdx_x * 36864 + (threadIdx_x + 735) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 159) % 192 * 3 + +blockIdx_x * 36864 + (threadIdx_x + 735) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 159) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (threadIdx_x + 735) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 159) % 192 * 3 + rx_outer_outer] + +735 + +threadIdx_x + 735 + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[threadIdx_x + 735] = kernel[blockIdx_x * 36864 + (threadIdx_x + 735) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 159) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[threadIdx_x + 735] = kernel[blockIdx_x * 36864 + (threadIdx_x + 735) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 159) % 192 * 3 + rx_outer_outer] + +36864 + +blockIdx_x * 36864 + +784 + +threadIdx_x + 784 + +192 + +(threadIdx_x + 784) // 192 + +4608 + +(threadIdx_x + 784) // 192 * 4608 + +blockIdx_x * 36864 + (threadIdx_x + 784) // 192 * 4608 + +576 + +rc_outer_outer * 576 + +blockIdx_x * 36864 + (threadIdx_x + 784) // 192 * 4608 + rc_outer_outer * 576 + +3 + +threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 784) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 784) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + +48 + +blockIdx_x * 36864 + (threadIdx_x + 784) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 48 + +kernel[blockIdx_x * 
36864 + (threadIdx_x + 784) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 48] + +784 + +threadIdx_x + 784 + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[threadIdx_x + 784] = kernel[blockIdx_x * 36864 + (threadIdx_x + 784) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 48] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[threadIdx_x + 784] = kernel[blockIdx_x * 36864 + (threadIdx_x + 784) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 48] + +36864 + +blockIdx_x * 36864 + +833 + +threadIdx_x + 833 + +192 + +(threadIdx_x + 833) // 192 + +4608 + +(threadIdx_x + 833) // 192 * 4608 + +blockIdx_x * 36864 + (threadIdx_x + 833) // 192 * 4608 + +576 + +rc_outer_outer * 576 + +blockIdx_x * 36864 + (threadIdx_x + 833) // 192 * 4608 + rc_outer_outer * 576 + +3 + +threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 833) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 833) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + +195 + +blockIdx_x * 36864 + (threadIdx_x + 833) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 195 + +kernel[blockIdx_x * 36864 + (threadIdx_x + 833) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 195] + +833 + +threadIdx_x + 833 + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[threadIdx_x + 833] = kernel[blockIdx_x * 36864 + (threadIdx_x + 833) // 192 * 4608 + 
rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 195] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[threadIdx_x + 833] = kernel[blockIdx_x * 36864 + (threadIdx_x + 833) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 195] + +36864 + +blockIdx_x * 36864 + +882 + +threadIdx_x + 882 + +192 + +(threadIdx_x + 882) // 192 + +4608 + +(threadIdx_x + 882) // 192 * 4608 + +blockIdx_x * 36864 + (threadIdx_x + 882) // 192 * 4608 + +576 + +rc_outer_outer * 576 + +blockIdx_x * 36864 + (threadIdx_x + 882) // 192 * 4608 + rc_outer_outer * 576 + +3 + +threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 882) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 882) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + +342 + +blockIdx_x * 36864 + (threadIdx_x + 882) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 342 + +kernel[blockIdx_x * 36864 + (threadIdx_x + 882) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 342] + +882 + +threadIdx_x + 882 + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[threadIdx_x + 882] = kernel[blockIdx_x * 36864 + (threadIdx_x + 882) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 342] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[threadIdx_x + 882] = kernel[blockIdx_x * 36864 + (threadIdx_x + 882) // 192 * 4608 + rc_outer_outer * 576 + 
threadIdx_x * 3 + rx_outer_outer + 342] + +36864 + +blockIdx_x * 36864 + +931 + +threadIdx_x + 931 + +192 + +(threadIdx_x + 931) // 192 + +4608 + +(threadIdx_x + 931) // 192 * 4608 + +blockIdx_x * 36864 + (threadIdx_x + 931) // 192 * 4608 + +576 + +rc_outer_outer * 576 + +blockIdx_x * 36864 + (threadIdx_x + 931) // 192 * 4608 + rc_outer_outer * 576 + +163 + +threadIdx_x + 163 + +192 + +(threadIdx_x + 163) % 192 + +3 + +(threadIdx_x + 163) % 192 * 3 + +blockIdx_x * 36864 + (threadIdx_x + 931) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 163) % 192 * 3 + +blockIdx_x * 36864 + (threadIdx_x + 931) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 163) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (threadIdx_x + 931) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 163) % 192 * 3 + rx_outer_outer] + +931 + +threadIdx_x + 931 + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[threadIdx_x + 931] = kernel[blockIdx_x * 36864 + (threadIdx_x + 931) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 163) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[threadIdx_x + 931] = kernel[blockIdx_x * 36864 + (threadIdx_x + 931) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 163) % 192 * 3 + rx_outer_outer] + +36864 + +blockIdx_x * 36864 + +980 + +threadIdx_x + 980 + +192 + +(threadIdx_x + 980) // 192 + +4608 + +(threadIdx_x + 980) // 192 * 4608 + +blockIdx_x * 36864 + (threadIdx_x + 980) // 192 * 4608 + +576 + +rc_outer_outer * 576 + +blockIdx_x * 36864 + (threadIdx_x + 980) // 192 * 4608 + rc_outer_outer * 576 + +3 + +threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 980) 
// 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 980) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + +60 + +blockIdx_x * 36864 + (threadIdx_x + 980) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 60 + +kernel[blockIdx_x * 36864 + (threadIdx_x + 980) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 60] + +980 + +threadIdx_x + 980 + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[threadIdx_x + 980] = kernel[blockIdx_x * 36864 + (threadIdx_x + 980) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 60] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[threadIdx_x + 980] = kernel[blockIdx_x * 36864 + (threadIdx_x + 980) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 60] + +36864 + +blockIdx_x * 36864 + +1029 + +threadIdx_x + 1029 + +192 + +(threadIdx_x + 1029) // 192 + +4608 + +(threadIdx_x + 1029) // 192 * 4608 + +blockIdx_x * 36864 + (threadIdx_x + 1029) // 192 * 4608 + +576 + +rc_outer_outer * 576 + +blockIdx_x * 36864 + (threadIdx_x + 1029) // 192 * 4608 + rc_outer_outer * 576 + +3 + +threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 1029) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 1029) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + +207 + +blockIdx_x * 36864 + (threadIdx_x + 1029) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 207 + +kernel[blockIdx_x * 36864 + (threadIdx_x + 1029) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + 
rx_outer_outer + 207] + +1029 + +threadIdx_x + 1029 + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[threadIdx_x + 1029] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1029) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 207] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[threadIdx_x + 1029] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1029) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 207] + +36864 + +blockIdx_x * 36864 + +1078 + +threadIdx_x + 1078 + +192 + +(threadIdx_x + 1078) // 192 + +4608 + +(threadIdx_x + 1078) // 192 * 4608 + +blockIdx_x * 36864 + (threadIdx_x + 1078) // 192 * 4608 + +576 + +rc_outer_outer * 576 + +blockIdx_x * 36864 + (threadIdx_x + 1078) // 192 * 4608 + rc_outer_outer * 576 + +3 + +threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 1078) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 1078) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + +354 + +blockIdx_x * 36864 + (threadIdx_x + 1078) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 354 + +kernel[blockIdx_x * 36864 + (threadIdx_x + 1078) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 354] + +1078 + +threadIdx_x + 1078 + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[threadIdx_x + 1078] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1078) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 
354] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[threadIdx_x + 1078] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1078) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 354] + +36864 + +blockIdx_x * 36864 + +1127 + +threadIdx_x + 1127 + +192 + +(threadIdx_x + 1127) // 192 + +4608 + +(threadIdx_x + 1127) // 192 * 4608 + +blockIdx_x * 36864 + (threadIdx_x + 1127) // 192 * 4608 + +576 + +rc_outer_outer * 576 + +blockIdx_x * 36864 + (threadIdx_x + 1127) // 192 * 4608 + rc_outer_outer * 576 + +167 + +threadIdx_x + 167 + +192 + +(threadIdx_x + 167) % 192 + +3 + +(threadIdx_x + 167) % 192 * 3 + +blockIdx_x * 36864 + (threadIdx_x + 1127) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 167) % 192 * 3 + +blockIdx_x * 36864 + (threadIdx_x + 1127) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 167) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (threadIdx_x + 1127) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 167) % 192 * 3 + rx_outer_outer] + +1127 + +threadIdx_x + 1127 + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[threadIdx_x + 1127] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1127) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 167) % 192 * 3 + rx_outer_outer] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[threadIdx_x + 1127] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1127) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 167) % 192 * 3 + rx_outer_outer] + 
+36864 + +blockIdx_x * 36864 + +1176 + +threadIdx_x + 1176 + +192 + +(threadIdx_x + 1176) // 192 + +4608 + +(threadIdx_x + 1176) // 192 * 4608 + +blockIdx_x * 36864 + (threadIdx_x + 1176) // 192 * 4608 + +576 + +rc_outer_outer * 576 + +blockIdx_x * 36864 + (threadIdx_x + 1176) // 192 * 4608 + rc_outer_outer * 576 + +3 + +threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 1176) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 1176) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + +72 + +blockIdx_x * 36864 + (threadIdx_x + 1176) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 72 + +kernel[blockIdx_x * 36864 + (threadIdx_x + 1176) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 72] + +1176 + +threadIdx_x + 1176 + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[threadIdx_x + 1176] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1176) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 72] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[threadIdx_x + 1176] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1176) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 72] + +36864 + +blockIdx_x * 36864 + +1225 + +threadIdx_x + 1225 + +192 + +(threadIdx_x + 1225) // 192 + +4608 + +(threadIdx_x + 1225) // 192 * 4608 + +blockIdx_x * 36864 + (threadIdx_x + 1225) // 192 * 4608 + +576 + +rc_outer_outer * 576 + +blockIdx_x * 36864 + (threadIdx_x + 1225) // 192 * 4608 + rc_outer_outer * 576 + +3 + +threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 1225) // 192 * 4608 + 
rc_outer_outer * 576 + threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 1225) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + +219 + +blockIdx_x * 36864 + (threadIdx_x + 1225) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 219 + +kernel[blockIdx_x * 36864 + (threadIdx_x + 1225) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 219] + +1225 + +threadIdx_x + 1225 + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[threadIdx_x + 1225] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1225) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 219] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[threadIdx_x + 1225] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1225) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 219] + +36864 + +blockIdx_x * 36864 + +1274 + +threadIdx_x + 1274 + +192 + +(threadIdx_x + 1274) // 192 + +4608 + +(threadIdx_x + 1274) // 192 * 4608 + +blockIdx_x * 36864 + (threadIdx_x + 1274) // 192 * 4608 + +576 + +rc_outer_outer * 576 + +blockIdx_x * 36864 + (threadIdx_x + 1274) // 192 * 4608 + rc_outer_outer * 576 + +3 + +threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 1274) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 1274) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + +366 + +blockIdx_x * 36864 + (threadIdx_x + 1274) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 366 + +kernel[blockIdx_x * 36864 + (threadIdx_x + 1274) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + 
rx_outer_outer + 366] + +1274 + +threadIdx_x + 1274 + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[threadIdx_x + 1274] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1274) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 366] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[threadIdx_x + 1274] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1274) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 366] + +36864 + +blockIdx_x * 36864 + +1323 + +threadIdx_x + 1323 + +192 + +(threadIdx_x + 1323) // 192 + +4608 + +(threadIdx_x + 1323) // 192 * 4608 + +blockIdx_x * 36864 + (threadIdx_x + 1323) // 192 * 4608 + +576 + +rc_outer_outer * 576 + +blockIdx_x * 36864 + (threadIdx_x + 1323) // 192 * 4608 + rc_outer_outer * 576 + +171 + +threadIdx_x + 171 + +192 + +(threadIdx_x + 171) % 192 + +3 + +(threadIdx_x + 171) % 192 * 3 + +blockIdx_x * 36864 + (threadIdx_x + 1323) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 171) % 192 * 3 + +blockIdx_x * 36864 + (threadIdx_x + 1323) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 171) % 192 * 3 + rx_outer_outer + +kernel[blockIdx_x * 36864 + (threadIdx_x + 1323) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 171) % 192 * 3 + rx_outer_outer] + +1323 + +threadIdx_x + 1323 + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[threadIdx_x + 1323] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1323) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 171) % 192 * 3 + rx_outer_outer] + 
+with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[threadIdx_x + 1323] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1323) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 171) % 192 * 3 + rx_outer_outer] + +36864 + +blockIdx_x * 36864 + +1372 + +threadIdx_x + 1372 + +192 + +(threadIdx_x + 1372) // 192 + +4608 + +(threadIdx_x + 1372) // 192 * 4608 + +blockIdx_x * 36864 + (threadIdx_x + 1372) // 192 * 4608 + +576 + +rc_outer_outer * 576 + +blockIdx_x * 36864 + (threadIdx_x + 1372) // 192 * 4608 + rc_outer_outer * 576 + +3 + +threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 1372) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 1372) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + +84 + +blockIdx_x * 36864 + (threadIdx_x + 1372) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 84 + +kernel[blockIdx_x * 36864 + (threadIdx_x + 1372) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 84] + +1372 + +threadIdx_x + 1372 + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[threadIdx_x + 1372] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1372) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 84] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[threadIdx_x + 1372] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1372) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 84] + +36864 + 
+blockIdx_x * 36864 + +1421 + +threadIdx_x + 1421 + +192 + +(threadIdx_x + 1421) // 192 + +4608 + +(threadIdx_x + 1421) // 192 * 4608 + +blockIdx_x * 36864 + (threadIdx_x + 1421) // 192 * 4608 + +576 + +rc_outer_outer * 576 + +blockIdx_x * 36864 + (threadIdx_x + 1421) // 192 * 4608 + rc_outer_outer * 576 + +3 + +threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 1421) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 1421) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + +231 + +blockIdx_x * 36864 + (threadIdx_x + 1421) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 231 + +kernel[blockIdx_x * 36864 + (threadIdx_x + 1421) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 231] + +1421 + +threadIdx_x + 1421 + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[threadIdx_x + 1421] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1421) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 231] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[threadIdx_x + 1421] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1421) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 231] + +36864 + +blockIdx_x * 36864 + +1470 + +threadIdx_x + 1470 + +192 + +(threadIdx_x + 1470) // 192 + +4608 + +(threadIdx_x + 1470) // 192 * 4608 + +blockIdx_x * 36864 + (threadIdx_x + 1470) // 192 * 4608 + +576 + +rc_outer_outer * 576 + +blockIdx_x * 36864 + (threadIdx_x + 1470) // 192 * 4608 + rc_outer_outer * 576 + +3 + +threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 1470) // 192 * 4608 + 
rc_outer_outer * 576 + threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 1470) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + +378 + +blockIdx_x * 36864 + (threadIdx_x + 1470) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 378 + +kernel[blockIdx_x * 36864 + (threadIdx_x + 1470) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 378] + +1470 + +threadIdx_x + 1470 + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[threadIdx_x + 1470] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1470) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 378] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[threadIdx_x + 1470] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1470) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 378] + +17 + +threadIdx_x < 17 + +36864 + +blockIdx_x * 36864 + +1519 + +threadIdx_x + 1519 + +192 + +(threadIdx_x + 1519) // 192 + +4608 + +(threadIdx_x + 1519) // 192 * 4608 + +blockIdx_x * 36864 + (threadIdx_x + 1519) // 192 * 4608 + +576 + +rc_outer_outer * 576 + +blockIdx_x * 36864 + (threadIdx_x + 1519) // 192 * 4608 + rc_outer_outer * 576 + +3 + +threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 1519) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + +blockIdx_x * 36864 + (threadIdx_x + 1519) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + +525 + +blockIdx_x * 36864 + (threadIdx_x + 1519) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 525 + +kernel[blockIdx_x * 36864 + (threadIdx_x + 1519) // 192 * 4608 + rc_outer_outer * 576 + 
threadIdx_x * 3 + rx_outer_outer + 525] + +1519 + +threadIdx_x + 1519 + +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +threadIdx_x = T.int32() +rc_outer_outer = T.int32() +rx_outer_outer = T.int32() +kernel_shared[threadIdx_x + 1519] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1519) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 525] + +threadIdx_x = T.int32() +if threadIdx_x < 17: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[threadIdx_x + 1519] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1519) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 525] + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + if threadIdx_x < 17: + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + rc_outer_outer = T.int32() + rx_outer_outer = T.int32() + kernel_shared[threadIdx_x + 1519] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1519) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 525] + +0 + +8 + +0 + +conv2d_nchw[0] + +rc_outer_inner + +504 + +rc_outer_inner * 504 + +threadIdx_x + +rc_outer_inner * 504 + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] + +24 + +rc_outer_inner * 24 + +kernel_shared[rc_outer_inner * 24] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24] + +conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24] + +0 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * 
kernel_shared[rc_outer_inner * 24] + +1 + +conv2d_nchw[1] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] + +24 + +rc_outer_inner * 24 + +192 + +rc_outer_inner * 24 + 192 + +kernel_shared[rc_outer_inner * 24 + 192] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 192] + +conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 192] + +1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 192] + +2 + +conv2d_nchw[2] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] + +24 + +rc_outer_inner * 24 + +384 + +rc_outer_inner * 24 + 384 + +kernel_shared[rc_outer_inner * 24 + 384] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 384] + +conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 384] + +2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 384] + +3 + +conv2d_nchw[3] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] + +24 + +rc_outer_inner * 24 + +576 + +rc_outer_inner * 24 + 576 + +kernel_shared[rc_outer_inner * 24 + 576] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * 
kernel_shared[rc_outer_inner * 24 + 576] + +conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 576] + +3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 576] + +0 + +conv2d_nchw[0] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +63 + +rc_outer_inner * 504 + threadIdx_x + 63 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] + +24 + +rc_outer_inner * 24 + +3 + +rc_outer_inner * 24 + 3 + +kernel_shared[rc_outer_inner * 24 + 3] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 3] + +conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 3] + +0 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 3] + +1 + +conv2d_nchw[1] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +63 + +rc_outer_inner * 504 + threadIdx_x + 63 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] + +24 + +rc_outer_inner * 24 + +195 + +rc_outer_inner * 24 + 195 + +kernel_shared[rc_outer_inner * 24 + 195] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 195] + +conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 195] + +1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = 
T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 195] + +2 + +conv2d_nchw[2] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +63 + +rc_outer_inner * 504 + threadIdx_x + 63 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] + +24 + +rc_outer_inner * 24 + +387 + +rc_outer_inner * 24 + 387 + +kernel_shared[rc_outer_inner * 24 + 387] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 387] + +conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 387] + +2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 387] + +3 + +conv2d_nchw[3] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +63 + +rc_outer_inner * 504 + threadIdx_x + 63 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] + +24 + +rc_outer_inner * 24 + +579 + +rc_outer_inner * 24 + 579 + +kernel_shared[rc_outer_inner * 24 + 579] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 579] + +conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 579] + +3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 
63] * kernel_shared[rc_outer_inner * 24 + 579] + +0 + +conv2d_nchw[0] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +126 + +rc_outer_inner * 504 + threadIdx_x + 126 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] + +24 + +rc_outer_inner * 24 + +6 + +rc_outer_inner * 24 + 6 + +kernel_shared[rc_outer_inner * 24 + 6] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 6] + +conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 6] + +0 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 6] + +1 + +conv2d_nchw[1] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +126 + +rc_outer_inner * 504 + threadIdx_x + 126 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] + +24 + +rc_outer_inner * 24 + +198 + +rc_outer_inner * 24 + 198 + +kernel_shared[rc_outer_inner * 24 + 198] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 198] + +conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 198] + +1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 198] + +2 + +conv2d_nchw[2] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +126 + +rc_outer_inner * 504 + threadIdx_x + 126 + +pad_temp_shared[rc_outer_inner * 
504 + threadIdx_x + 126] + +24 + +rc_outer_inner * 24 + +390 + +rc_outer_inner * 24 + 390 + +kernel_shared[rc_outer_inner * 24 + 390] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 390] + +conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 390] + +2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 390] + +3 + +conv2d_nchw[3] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +126 + +rc_outer_inner * 504 + threadIdx_x + 126 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] + +24 + +rc_outer_inner * 24 + +582 + +rc_outer_inner * 24 + 582 + +kernel_shared[rc_outer_inner * 24 + 582] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 582] + +conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 582] + +3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 582] + +0 + +conv2d_nchw[0] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +189 + +rc_outer_inner * 504 + threadIdx_x + 189 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] + +24 + +rc_outer_inner * 24 + +9 + +rc_outer_inner * 24 + 9 + +kernel_shared[rc_outer_inner * 24 + 9] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * 
kernel_shared[rc_outer_inner * 24 + 9] + +conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 9] + +0 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 9] + +1 + +conv2d_nchw[1] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +189 + +rc_outer_inner * 504 + threadIdx_x + 189 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] + +24 + +rc_outer_inner * 24 + +201 + +rc_outer_inner * 24 + 201 + +kernel_shared[rc_outer_inner * 24 + 201] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 201] + +conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 201] + +1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 201] + +2 + +conv2d_nchw[2] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +189 + +rc_outer_inner * 504 + threadIdx_x + 189 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] + +24 + +rc_outer_inner * 24 + +393 + +rc_outer_inner * 24 + 393 + +kernel_shared[rc_outer_inner * 24 + 393] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 393] + +conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 393] + +2 + +conv2d_nchw = T.Buffer((8,), scope="local", 
align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 393] + +3 + +conv2d_nchw[3] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +189 + +rc_outer_inner * 504 + threadIdx_x + 189 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] + +24 + +rc_outer_inner * 24 + +585 + +rc_outer_inner * 24 + 585 + +kernel_shared[rc_outer_inner * 24 + 585] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 585] + +conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 585] + +3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 585] + +0 + +conv2d_nchw[0] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +252 + +rc_outer_inner * 504 + threadIdx_x + 252 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] + +24 + +rc_outer_inner * 24 + +12 + +rc_outer_inner * 24 + 12 + +kernel_shared[rc_outer_inner * 24 + 12] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 12] + +conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 12] + +0 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[0] = conv2d_nchw[0] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 12] + +1 + +conv2d_nchw[1] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +252 + +rc_outer_inner * 504 + threadIdx_x + 252 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] + +24 + +rc_outer_inner * 24 + +204 + +rc_outer_inner * 24 + 204 + +kernel_shared[rc_outer_inner * 24 + 204] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 204] + +conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 204] + +1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 204] + +2 + +conv2d_nchw[2] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +252 + +rc_outer_inner * 504 + threadIdx_x + 252 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] + +24 + +rc_outer_inner * 24 + +396 + +rc_outer_inner * 24 + 396 + +kernel_shared[rc_outer_inner * 24 + 396] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 396] + +conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 396] + +2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 396] + +3 + +conv2d_nchw[3] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +252 + 
+rc_outer_inner * 504 + threadIdx_x + 252 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] + +24 + +rc_outer_inner * 24 + +588 + +rc_outer_inner * 24 + 588 + +kernel_shared[rc_outer_inner * 24 + 588] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 588] + +conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 588] + +3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 588] + +0 + +conv2d_nchw[0] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +315 + +rc_outer_inner * 504 + threadIdx_x + 315 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] + +24 + +rc_outer_inner * 24 + +15 + +rc_outer_inner * 24 + 15 + +kernel_shared[rc_outer_inner * 24 + 15] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 15] + +conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 15] + +0 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 15] + +1 + +conv2d_nchw[1] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +315 + +rc_outer_inner * 504 + threadIdx_x + 315 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] + +24 + +rc_outer_inner * 24 + +207 + +rc_outer_inner * 24 + 207 + +kernel_shared[rc_outer_inner * 24 + 207] + 
+pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 207] + +conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 207] + +1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 207] + +2 + +conv2d_nchw[2] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +315 + +rc_outer_inner * 504 + threadIdx_x + 315 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] + +24 + +rc_outer_inner * 24 + +399 + +rc_outer_inner * 24 + 399 + +kernel_shared[rc_outer_inner * 24 + 399] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 399] + +conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 399] + +2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 399] + +3 + +conv2d_nchw[3] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +315 + +rc_outer_inner * 504 + threadIdx_x + 315 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] + +24 + +rc_outer_inner * 24 + +591 + +rc_outer_inner * 24 + 591 + +kernel_shared[rc_outer_inner * 24 + 591] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 591] + +conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 
24 + 591] + +3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 591] + +0 + +conv2d_nchw[0] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +378 + +rc_outer_inner * 504 + threadIdx_x + 378 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] + +24 + +rc_outer_inner * 24 + +18 + +rc_outer_inner * 24 + 18 + +kernel_shared[rc_outer_inner * 24 + 18] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 18] + +conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 18] + +0 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 18] + +1 + +conv2d_nchw[1] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +378 + +rc_outer_inner * 504 + threadIdx_x + 378 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] + +24 + +rc_outer_inner * 24 + +210 + +rc_outer_inner * 24 + 210 + +kernel_shared[rc_outer_inner * 24 + 210] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 210] + +conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 210] + +1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), 
scope="shared") +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 210] + +2 + +conv2d_nchw[2] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +378 + +rc_outer_inner * 504 + threadIdx_x + 378 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] + +24 + +rc_outer_inner * 24 + +402 + +rc_outer_inner * 24 + 402 + +kernel_shared[rc_outer_inner * 24 + 402] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 402] + +conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 402] + +2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 402] + +3 + +conv2d_nchw[3] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +378 + +rc_outer_inner * 504 + threadIdx_x + 378 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] + +24 + +rc_outer_inner * 24 + +594 + +rc_outer_inner * 24 + 594 + +kernel_shared[rc_outer_inner * 24 + 594] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 594] + +conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 594] + +3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 594] + +0 + +conv2d_nchw[0] + +504 + +rc_outer_inner * 504 + 
+rc_outer_inner * 504 + threadIdx_x + +441 + +rc_outer_inner * 504 + threadIdx_x + 441 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] + +24 + +rc_outer_inner * 24 + +21 + +rc_outer_inner * 24 + 21 + +kernel_shared[rc_outer_inner * 24 + 21] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 21] + +conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 21] + +0 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 21] + +1 + +conv2d_nchw[1] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +441 + +rc_outer_inner * 504 + threadIdx_x + 441 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] + +24 + +rc_outer_inner * 24 + +213 + +rc_outer_inner * 24 + 213 + +kernel_shared[rc_outer_inner * 24 + 213] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 213] + +conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 213] + +1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 213] + +2 + +conv2d_nchw[2] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +441 + +rc_outer_inner * 504 + threadIdx_x + 441 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] + +24 + +rc_outer_inner * 24 + +405 + +rc_outer_inner * 24 + 405 + 
+kernel_shared[rc_outer_inner * 24 + 405] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 405] + +conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 405] + +2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 405] + +3 + +conv2d_nchw[3] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +441 + +rc_outer_inner * 504 + threadIdx_x + 441 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] + +24 + +rc_outer_inner * 24 + +597 + +rc_outer_inner * 24 + 597 + +kernel_shared[rc_outer_inner * 24 + 597] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 597] + +conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 597] + +3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 597] + +4 + +conv2d_nchw[4] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] + +24 + +rc_outer_inner * 24 + +768 + +rc_outer_inner * 24 + 768 + +kernel_shared[rc_outer_inner * 24 + 768] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 768] + +conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 768] + +4 + 
+conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 768] + +5 + +conv2d_nchw[5] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] + +24 + +rc_outer_inner * 24 + +960 + +rc_outer_inner * 24 + 960 + +kernel_shared[rc_outer_inner * 24 + 960] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 960] + +conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 960] + +5 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 960] + +6 + +conv2d_nchw[6] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] + +24 + +rc_outer_inner * 24 + +1152 + +rc_outer_inner * 24 + 1152 + +kernel_shared[rc_outer_inner * 24 + 1152] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 1152] + +conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 1152] + +6 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 1152] + +7 + 
+conv2d_nchw[7] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] + +24 + +rc_outer_inner * 24 + +1344 + +rc_outer_inner * 24 + 1344 + +kernel_shared[rc_outer_inner * 24 + 1344] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 1344] + +conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 1344] + +7 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 1344] + +4 + +conv2d_nchw[4] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +63 + +rc_outer_inner * 504 + threadIdx_x + 63 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] + +24 + +rc_outer_inner * 24 + +771 + +rc_outer_inner * 24 + 771 + +kernel_shared[rc_outer_inner * 24 + 771] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 771] + +conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 771] + +4 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 771] + +5 + +conv2d_nchw[5] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +63 + +rc_outer_inner * 504 + threadIdx_x + 63 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] + +24 + +rc_outer_inner * 24 + +963 + +rc_outer_inner * 24 + 963 + 
+kernel_shared[rc_outer_inner * 24 + 963] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 963] + +conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 963] + +5 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 963] + +6 + +conv2d_nchw[6] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +63 + +rc_outer_inner * 504 + threadIdx_x + 63 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] + +24 + +rc_outer_inner * 24 + +1155 + +rc_outer_inner * 24 + 1155 + +kernel_shared[rc_outer_inner * 24 + 1155] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 1155] + +conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 1155] + +6 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 1155] + +7 + +conv2d_nchw[7] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +63 + +rc_outer_inner * 504 + threadIdx_x + 63 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] + +24 + +rc_outer_inner * 24 + +1347 + +rc_outer_inner * 24 + 1347 + +kernel_shared[rc_outer_inner * 24 + 1347] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 1347] + +conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x 
+ 63] * kernel_shared[rc_outer_inner * 24 + 1347] + +7 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 1347] + +4 + +conv2d_nchw[4] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +126 + +rc_outer_inner * 504 + threadIdx_x + 126 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] + +24 + +rc_outer_inner * 24 + +774 + +rc_outer_inner * 24 + 774 + +kernel_shared[rc_outer_inner * 24 + 774] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 774] + +conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 774] + +4 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 774] + +5 + +conv2d_nchw[5] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +126 + +rc_outer_inner * 504 + threadIdx_x + 126 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] + +24 + +rc_outer_inner * 24 + +966 + +rc_outer_inner * 24 + 966 + +kernel_shared[rc_outer_inner * 24 + 966] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 966] + +conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 966] + +5 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = 
T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 966] + +6 + +conv2d_nchw[6] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +126 + +rc_outer_inner * 504 + threadIdx_x + 126 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] + +24 + +rc_outer_inner * 24 + +1158 + +rc_outer_inner * 24 + 1158 + +kernel_shared[rc_outer_inner * 24 + 1158] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 1158] + +conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 1158] + +6 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 1158] + +7 + +conv2d_nchw[7] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +126 + +rc_outer_inner * 504 + threadIdx_x + 126 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] + +24 + +rc_outer_inner * 24 + +1350 + +rc_outer_inner * 24 + 1350 + +kernel_shared[rc_outer_inner * 24 + 1350] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 1350] + +conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 1350] + +7 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 1350] + +4 
+ +conv2d_nchw[4] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +189 + +rc_outer_inner * 504 + threadIdx_x + 189 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] + +24 + +rc_outer_inner * 24 + +777 + +rc_outer_inner * 24 + 777 + +kernel_shared[rc_outer_inner * 24 + 777] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 777] + +conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 777] + +4 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 777] + +5 + +conv2d_nchw[5] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +189 + +rc_outer_inner * 504 + threadIdx_x + 189 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] + +24 + +rc_outer_inner * 24 + +969 + +rc_outer_inner * 24 + 969 + +kernel_shared[rc_outer_inner * 24 + 969] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 969] + +conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 969] + +5 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 969] + +6 + +conv2d_nchw[6] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +189 + +rc_outer_inner * 504 + threadIdx_x + 189 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] + +24 + 
+rc_outer_inner * 24 + +1161 + +rc_outer_inner * 24 + 1161 + +kernel_shared[rc_outer_inner * 24 + 1161] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 1161] + +conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 1161] + +6 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 1161] + +7 + +conv2d_nchw[7] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +189 + +rc_outer_inner * 504 + threadIdx_x + 189 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] + +24 + +rc_outer_inner * 24 + +1353 + +rc_outer_inner * 24 + 1353 + +kernel_shared[rc_outer_inner * 24 + 1353] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 1353] + +conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 1353] + +7 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 1353] + +4 + +conv2d_nchw[4] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +252 + +rc_outer_inner * 504 + threadIdx_x + 252 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] + +24 + +rc_outer_inner * 24 + +780 + +rc_outer_inner * 24 + 780 + +kernel_shared[rc_outer_inner * 24 + 780] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 
780] + +conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 780] + +4 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 780] + +5 + +conv2d_nchw[5] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +252 + +rc_outer_inner * 504 + threadIdx_x + 252 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] + +24 + +rc_outer_inner * 24 + +972 + +rc_outer_inner * 24 + 972 + +kernel_shared[rc_outer_inner * 24 + 972] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 972] + +conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 972] + +5 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 972] + +6 + +conv2d_nchw[6] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +252 + +rc_outer_inner * 504 + threadIdx_x + 252 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] + +24 + +rc_outer_inner * 24 + +1164 + +rc_outer_inner * 24 + 1164 + +kernel_shared[rc_outer_inner * 24 + 1164] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 1164] + +conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 1164] + +6 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = 
T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 1164] + +7 + +conv2d_nchw[7] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +252 + +rc_outer_inner * 504 + threadIdx_x + 252 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] + +24 + +rc_outer_inner * 24 + +1356 + +rc_outer_inner * 24 + 1356 + +kernel_shared[rc_outer_inner * 24 + 1356] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 1356] + +conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 1356] + +7 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 1356] + +4 + +conv2d_nchw[4] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +315 + +rc_outer_inner * 504 + threadIdx_x + 315 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] + +24 + +rc_outer_inner * 24 + +783 + +rc_outer_inner * 24 + 783 + +kernel_shared[rc_outer_inner * 24 + 783] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 783] + +conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 783] + +4 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 
504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 783] + +5 + +conv2d_nchw[5] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +315 + +rc_outer_inner * 504 + threadIdx_x + 315 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] + +24 + +rc_outer_inner * 24 + +975 + +rc_outer_inner * 24 + 975 + +kernel_shared[rc_outer_inner * 24 + 975] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 975] + +conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 975] + +5 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 975] + +6 + +conv2d_nchw[6] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +315 + +rc_outer_inner * 504 + threadIdx_x + 315 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] + +24 + +rc_outer_inner * 24 + +1167 + +rc_outer_inner * 24 + 1167 + +kernel_shared[rc_outer_inner * 24 + 1167] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 1167] + +conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 1167] + +6 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 1167] + +7 + +conv2d_nchw[7] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +315 + +rc_outer_inner * 504 + threadIdx_x + 
315 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] + +24 + +rc_outer_inner * 24 + +1359 + +rc_outer_inner * 24 + 1359 + +kernel_shared[rc_outer_inner * 24 + 1359] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 1359] + +conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 1359] + +7 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 1359] + +4 + +conv2d_nchw[4] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +378 + +rc_outer_inner * 504 + threadIdx_x + 378 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] + +24 + +rc_outer_inner * 24 + +786 + +rc_outer_inner * 24 + 786 + +kernel_shared[rc_outer_inner * 24 + 786] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 786] + +conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 786] + +4 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 786] + +5 + +conv2d_nchw[5] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +378 + +rc_outer_inner * 504 + threadIdx_x + 378 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] + +24 + +rc_outer_inner * 24 + +978 + +rc_outer_inner * 24 + 978 + +kernel_shared[rc_outer_inner * 24 + 978] + +pad_temp_shared[rc_outer_inner 
* 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 978] + +conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 978] + +5 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 978] + +6 + +conv2d_nchw[6] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +378 + +rc_outer_inner * 504 + threadIdx_x + 378 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] + +24 + +rc_outer_inner * 24 + +1170 + +rc_outer_inner * 24 + 1170 + +kernel_shared[rc_outer_inner * 24 + 1170] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 1170] + +conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 1170] + +6 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 1170] + +7 + +conv2d_nchw[7] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +378 + +rc_outer_inner * 504 + threadIdx_x + 378 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] + +24 + +rc_outer_inner * 24 + +1362 + +rc_outer_inner * 24 + 1362 + +kernel_shared[rc_outer_inner * 24 + 1362] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 1362] + +conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 1362] + +7 + 
+conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 1362] + +4 + +conv2d_nchw[4] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +441 + +rc_outer_inner * 504 + threadIdx_x + 441 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] + +24 + +rc_outer_inner * 24 + +789 + +rc_outer_inner * 24 + 789 + +kernel_shared[rc_outer_inner * 24 + 789] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 789] + +conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 789] + +4 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 789] + +5 + +conv2d_nchw[5] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +441 + +rc_outer_inner * 504 + threadIdx_x + 441 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] + +24 + +rc_outer_inner * 24 + +981 + +rc_outer_inner * 24 + 981 + +kernel_shared[rc_outer_inner * 24 + 981] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 981] + +conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 981] + +5 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") 
+conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 981] + +6 + +conv2d_nchw[6] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +441 + +rc_outer_inner * 504 + threadIdx_x + 441 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] + +24 + +rc_outer_inner * 24 + +1173 + +rc_outer_inner * 24 + 1173 + +kernel_shared[rc_outer_inner * 24 + 1173] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 1173] + +conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 1173] + +6 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 1173] + +7 + +conv2d_nchw[7] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +441 + +rc_outer_inner * 504 + threadIdx_x + 441 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] + +24 + +rc_outer_inner * 24 + +1365 + +rc_outer_inner * 24 + 1365 + +kernel_shared[rc_outer_inner * 24 + 1365] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 1365] + +conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 1365] + +7 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 1365] + +0 + +conv2d_nchw[0] + +504 + +rc_outer_inner * 504 + 
+rc_outer_inner * 504 + threadIdx_x + +7 + +rc_outer_inner * 504 + threadIdx_x + 7 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] + +24 + +rc_outer_inner * 24 + +1 + +rc_outer_inner * 24 + 1 + +kernel_shared[rc_outer_inner * 24 + 1] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1] + +conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1] + +0 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1] + +1 + +conv2d_nchw[1] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +7 + +rc_outer_inner * 504 + threadIdx_x + 7 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] + +24 + +rc_outer_inner * 24 + +193 + +rc_outer_inner * 24 + 193 + +kernel_shared[rc_outer_inner * 24 + 193] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 193] + +conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 193] + +1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 193] + +2 + +conv2d_nchw[2] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +7 + +rc_outer_inner * 504 + threadIdx_x + 7 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] + +24 + +rc_outer_inner * 24 + +385 + +rc_outer_inner * 24 + 385 + +kernel_shared[rc_outer_inner * 24 + 
385] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 385] + +conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 385] + +2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 385] + +3 + +conv2d_nchw[3] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +7 + +rc_outer_inner * 504 + threadIdx_x + 7 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] + +24 + +rc_outer_inner * 24 + +577 + +rc_outer_inner * 24 + 577 + +kernel_shared[rc_outer_inner * 24 + 577] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 577] + +conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 577] + +3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 577] + +0 + +conv2d_nchw[0] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +70 + +rc_outer_inner * 504 + threadIdx_x + 70 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] + +24 + +rc_outer_inner * 24 + +4 + +rc_outer_inner * 24 + 4 + +kernel_shared[rc_outer_inner * 24 + 4] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 4] + +conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 4] + +0 + 
+conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 4] + +1 + +conv2d_nchw[1] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +70 + +rc_outer_inner * 504 + threadIdx_x + 70 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] + +24 + +rc_outer_inner * 24 + +196 + +rc_outer_inner * 24 + 196 + +kernel_shared[rc_outer_inner * 24 + 196] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 196] + +conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 196] + +1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 196] + +2 + +conv2d_nchw[2] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +70 + +rc_outer_inner * 504 + threadIdx_x + 70 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] + +24 + +rc_outer_inner * 24 + +388 + +rc_outer_inner * 24 + 388 + +kernel_shared[rc_outer_inner * 24 + 388] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 388] + +conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 388] + +2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[2] 
= conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 388] + +3 + +conv2d_nchw[3] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +70 + +rc_outer_inner * 504 + threadIdx_x + 70 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] + +24 + +rc_outer_inner * 24 + +580 + +rc_outer_inner * 24 + 580 + +kernel_shared[rc_outer_inner * 24 + 580] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 580] + +conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 580] + +3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 580] + +0 + +conv2d_nchw[0] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +133 + +rc_outer_inner * 504 + threadIdx_x + 133 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] + +24 + +rc_outer_inner * 24 + +7 + +rc_outer_inner * 24 + 7 + +kernel_shared[rc_outer_inner * 24 + 7] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 7] + +conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 7] + +0 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 7] + +1 + +conv2d_nchw[1] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +133 + 
+rc_outer_inner * 504 + threadIdx_x + 133 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] + +24 + +rc_outer_inner * 24 + +199 + +rc_outer_inner * 24 + 199 + +kernel_shared[rc_outer_inner * 24 + 199] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 199] + +conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 199] + +1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 199] + +2 + +conv2d_nchw[2] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +133 + +rc_outer_inner * 504 + threadIdx_x + 133 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] + +24 + +rc_outer_inner * 24 + +391 + +rc_outer_inner * 24 + 391 + +kernel_shared[rc_outer_inner * 24 + 391] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 391] + +conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 391] + +2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 391] + +3 + +conv2d_nchw[3] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +133 + +rc_outer_inner * 504 + threadIdx_x + 133 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] + +24 + +rc_outer_inner * 24 + +583 + +rc_outer_inner * 24 + 583 + +kernel_shared[rc_outer_inner * 24 + 583] + 
+pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 583] + +conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 583] + +3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 583] + +0 + +conv2d_nchw[0] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +196 + +rc_outer_inner * 504 + threadIdx_x + 196 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] + +24 + +rc_outer_inner * 24 + +10 + +rc_outer_inner * 24 + 10 + +kernel_shared[rc_outer_inner * 24 + 10] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 10] + +conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 10] + +0 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 10] + +1 + +conv2d_nchw[1] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +196 + +rc_outer_inner * 504 + threadIdx_x + 196 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] + +24 + +rc_outer_inner * 24 + +202 + +rc_outer_inner * 24 + 202 + +kernel_shared[rc_outer_inner * 24 + 202] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 202] + +conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 
202] + +1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 202] + +2 + +conv2d_nchw[2] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +196 + +rc_outer_inner * 504 + threadIdx_x + 196 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] + +24 + +rc_outer_inner * 24 + +394 + +rc_outer_inner * 24 + 394 + +kernel_shared[rc_outer_inner * 24 + 394] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 394] + +conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 394] + +2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 394] + +3 + +conv2d_nchw[3] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +196 + +rc_outer_inner * 504 + threadIdx_x + 196 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] + +24 + +rc_outer_inner * 24 + +586 + +rc_outer_inner * 24 + 586 + +kernel_shared[rc_outer_inner * 24 + 586] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 586] + +conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 586] + +3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), 
scope="shared") +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 586] + +0 + +conv2d_nchw[0] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +259 + +rc_outer_inner * 504 + threadIdx_x + 259 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] + +24 + +rc_outer_inner * 24 + +13 + +rc_outer_inner * 24 + 13 + +kernel_shared[rc_outer_inner * 24 + 13] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 13] + +conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 13] + +0 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 13] + +1 + +conv2d_nchw[1] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +259 + +rc_outer_inner * 504 + threadIdx_x + 259 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] + +24 + +rc_outer_inner * 24 + +205 + +rc_outer_inner * 24 + 205 + +kernel_shared[rc_outer_inner * 24 + 205] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 205] + +conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 205] + +1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 205] + +2 + +conv2d_nchw[2] + +504 + +rc_outer_inner * 504 + 
+rc_outer_inner * 504 + threadIdx_x + +259 + +rc_outer_inner * 504 + threadIdx_x + 259 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] + +24 + +rc_outer_inner * 24 + +397 + +rc_outer_inner * 24 + 397 + +kernel_shared[rc_outer_inner * 24 + 397] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 397] + +conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 397] + +2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 397] + +3 + +conv2d_nchw[3] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +259 + +rc_outer_inner * 504 + threadIdx_x + 259 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] + +24 + +rc_outer_inner * 24 + +589 + +rc_outer_inner * 24 + 589 + +kernel_shared[rc_outer_inner * 24 + 589] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 589] + +conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 589] + +3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 589] + +0 + +conv2d_nchw[0] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +322 + +rc_outer_inner * 504 + threadIdx_x + 322 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] + +24 + +rc_outer_inner * 24 + +16 + +rc_outer_inner * 24 + 16 + 
+kernel_shared[rc_outer_inner * 24 + 16] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 16] + +conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 16] + +0 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 16] + +1 + +conv2d_nchw[1] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +322 + +rc_outer_inner * 504 + threadIdx_x + 322 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] + +24 + +rc_outer_inner * 24 + +208 + +rc_outer_inner * 24 + 208 + +kernel_shared[rc_outer_inner * 24 + 208] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 208] + +conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 208] + +1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 208] + +2 + +conv2d_nchw[2] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +322 + +rc_outer_inner * 504 + threadIdx_x + 322 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] + +24 + +rc_outer_inner * 24 + +400 + +rc_outer_inner * 24 + 400 + +kernel_shared[rc_outer_inner * 24 + 400] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 400] + +conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x 
+ 322] * kernel_shared[rc_outer_inner * 24 + 400] + +2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 400] + +3 + +conv2d_nchw[3] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +322 + +rc_outer_inner * 504 + threadIdx_x + 322 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] + +24 + +rc_outer_inner * 24 + +592 + +rc_outer_inner * 24 + 592 + +kernel_shared[rc_outer_inner * 24 + 592] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 592] + +conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 592] + +3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 592] + +0 + +conv2d_nchw[0] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +385 + +rc_outer_inner * 504 + threadIdx_x + 385 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] + +24 + +rc_outer_inner * 24 + +19 + +rc_outer_inner * 24 + 19 + +kernel_shared[rc_outer_inner * 24 + 19] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 19] + +conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 19] + +0 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() 
+kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 19] + +1 + +conv2d_nchw[1] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +385 + +rc_outer_inner * 504 + threadIdx_x + 385 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] + +24 + +rc_outer_inner * 24 + +211 + +rc_outer_inner * 24 + 211 + +kernel_shared[rc_outer_inner * 24 + 211] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 211] + +conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 211] + +1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 211] + +2 + +conv2d_nchw[2] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +385 + +rc_outer_inner * 504 + threadIdx_x + 385 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] + +24 + +rc_outer_inner * 24 + +403 + +rc_outer_inner * 24 + 403 + +kernel_shared[rc_outer_inner * 24 + 403] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 403] + +conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 403] + +2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 403] + +3 + +conv2d_nchw[3] + 
+504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +385 + +rc_outer_inner * 504 + threadIdx_x + 385 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] + +24 + +rc_outer_inner * 24 + +595 + +rc_outer_inner * 24 + 595 + +kernel_shared[rc_outer_inner * 24 + 595] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 595] + +conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 595] + +3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 595] + +0 + +conv2d_nchw[0] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +448 + +rc_outer_inner * 504 + threadIdx_x + 448 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] + +24 + +rc_outer_inner * 24 + +22 + +rc_outer_inner * 24 + 22 + +kernel_shared[rc_outer_inner * 24 + 22] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 22] + +conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 22] + +0 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 22] + +1 + +conv2d_nchw[1] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +448 + +rc_outer_inner * 504 + threadIdx_x + 448 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] + +24 + +rc_outer_inner * 24 + +214 + 
+rc_outer_inner * 24 + 214 + +kernel_shared[rc_outer_inner * 24 + 214] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 214] + +conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 214] + +1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 214] + +2 + +conv2d_nchw[2] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +448 + +rc_outer_inner * 504 + threadIdx_x + 448 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] + +24 + +rc_outer_inner * 24 + +406 + +rc_outer_inner * 24 + 406 + +kernel_shared[rc_outer_inner * 24 + 406] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 406] + +conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 406] + +2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 406] + +3 + +conv2d_nchw[3] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +448 + +rc_outer_inner * 504 + threadIdx_x + 448 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] + +24 + +rc_outer_inner * 24 + +598 + +rc_outer_inner * 24 + 598 + +kernel_shared[rc_outer_inner * 24 + 598] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 598] + +conv2d_nchw[3] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 598] + +3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 598] + +4 + +conv2d_nchw[4] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +7 + +rc_outer_inner * 504 + threadIdx_x + 7 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] + +24 + +rc_outer_inner * 24 + +769 + +rc_outer_inner * 24 + 769 + +kernel_shared[rc_outer_inner * 24 + 769] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 769] + +conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 769] + +4 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 769] + +5 + +conv2d_nchw[5] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +7 + +rc_outer_inner * 504 + threadIdx_x + 7 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] + +24 + +rc_outer_inner * 24 + +961 + +rc_outer_inner * 24 + 961 + +kernel_shared[rc_outer_inner * 24 + 961] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 961] + +conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 961] + +5 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = 
T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 961] + +6 + +conv2d_nchw[6] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +7 + +rc_outer_inner * 504 + threadIdx_x + 7 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] + +24 + +rc_outer_inner * 24 + +1153 + +rc_outer_inner * 24 + 1153 + +kernel_shared[rc_outer_inner * 24 + 1153] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1153] + +conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1153] + +6 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1153] + +7 + +conv2d_nchw[7] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +7 + +rc_outer_inner * 504 + threadIdx_x + 7 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] + +24 + +rc_outer_inner * 24 + +1345 + +rc_outer_inner * 24 + 1345 + +kernel_shared[rc_outer_inner * 24 + 1345] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1345] + +conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1345] + +7 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1345] + +4 
+ +conv2d_nchw[4] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +70 + +rc_outer_inner * 504 + threadIdx_x + 70 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] + +24 + +rc_outer_inner * 24 + +772 + +rc_outer_inner * 24 + 772 + +kernel_shared[rc_outer_inner * 24 + 772] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 772] + +conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 772] + +4 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 772] + +5 + +conv2d_nchw[5] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +70 + +rc_outer_inner * 504 + threadIdx_x + 70 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] + +24 + +rc_outer_inner * 24 + +964 + +rc_outer_inner * 24 + 964 + +kernel_shared[rc_outer_inner * 24 + 964] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 964] + +conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 964] + +5 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 964] + +6 + +conv2d_nchw[6] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +70 + +rc_outer_inner * 504 + threadIdx_x + 70 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] + +24 + +rc_outer_inner * 24 
+ +1156 + +rc_outer_inner * 24 + 1156 + +kernel_shared[rc_outer_inner * 24 + 1156] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 1156] + +conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 1156] + +6 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 1156] + +7 + +conv2d_nchw[7] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +70 + +rc_outer_inner * 504 + threadIdx_x + 70 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] + +24 + +rc_outer_inner * 24 + +1348 + +rc_outer_inner * 24 + 1348 + +kernel_shared[rc_outer_inner * 24 + 1348] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 1348] + +conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 1348] + +7 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 1348] + +4 + +conv2d_nchw[4] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +133 + +rc_outer_inner * 504 + threadIdx_x + 133 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] + +24 + +rc_outer_inner * 24 + +775 + +rc_outer_inner * 24 + 775 + +kernel_shared[rc_outer_inner * 24 + 775] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 775] + +conv2d_nchw[4] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 775] + +4 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 775] + +5 + +conv2d_nchw[5] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +133 + +rc_outer_inner * 504 + threadIdx_x + 133 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] + +24 + +rc_outer_inner * 24 + +967 + +rc_outer_inner * 24 + 967 + +kernel_shared[rc_outer_inner * 24 + 967] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 967] + +conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 967] + +5 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 967] + +6 + +conv2d_nchw[6] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +133 + +rc_outer_inner * 504 + threadIdx_x + 133 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] + +24 + +rc_outer_inner * 24 + +1159 + +rc_outer_inner * 24 + 1159 + +kernel_shared[rc_outer_inner * 24 + 1159] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 1159] + +conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 1159] + +6 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), 
scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 1159] + +7 + +conv2d_nchw[7] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +133 + +rc_outer_inner * 504 + threadIdx_x + 133 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] + +24 + +rc_outer_inner * 24 + +1351 + +rc_outer_inner * 24 + 1351 + +kernel_shared[rc_outer_inner * 24 + 1351] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 1351] + +conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 1351] + +7 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 1351] + +4 + +conv2d_nchw[4] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +196 + +rc_outer_inner * 504 + threadIdx_x + 196 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] + +24 + +rc_outer_inner * 24 + +778 + +rc_outer_inner * 24 + 778 + +kernel_shared[rc_outer_inner * 24 + 778] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 778] + +conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 778] + +4 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x 
+ 196] * kernel_shared[rc_outer_inner * 24 + 778] + +5 + +conv2d_nchw[5] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +196 + +rc_outer_inner * 504 + threadIdx_x + 196 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] + +24 + +rc_outer_inner * 24 + +970 + +rc_outer_inner * 24 + 970 + +kernel_shared[rc_outer_inner * 24 + 970] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 970] + +conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 970] + +5 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 970] + +6 + +conv2d_nchw[6] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +196 + +rc_outer_inner * 504 + threadIdx_x + 196 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] + +24 + +rc_outer_inner * 24 + +1162 + +rc_outer_inner * 24 + 1162 + +kernel_shared[rc_outer_inner * 24 + 1162] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 1162] + +conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 1162] + +6 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 1162] + +7 + +conv2d_nchw[7] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +196 + +rc_outer_inner * 504 + threadIdx_x + 196 + 
+pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] + +24 + +rc_outer_inner * 24 + +1354 + +rc_outer_inner * 24 + 1354 + +kernel_shared[rc_outer_inner * 24 + 1354] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 1354] + +conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 1354] + +7 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 1354] + +4 + +conv2d_nchw[4] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +259 + +rc_outer_inner * 504 + threadIdx_x + 259 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] + +24 + +rc_outer_inner * 24 + +781 + +rc_outer_inner * 24 + 781 + +kernel_shared[rc_outer_inner * 24 + 781] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 781] + +conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 781] + +4 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 781] + +5 + +conv2d_nchw[5] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +259 + +rc_outer_inner * 504 + threadIdx_x + 259 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] + +24 + +rc_outer_inner * 24 + +973 + +rc_outer_inner * 24 + 973 + +kernel_shared[rc_outer_inner * 24 + 973] + +pad_temp_shared[rc_outer_inner * 504 
+ threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 973] + +conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 973] + +5 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 973] + +6 + +conv2d_nchw[6] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +259 + +rc_outer_inner * 504 + threadIdx_x + 259 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] + +24 + +rc_outer_inner * 24 + +1165 + +rc_outer_inner * 24 + 1165 + +kernel_shared[rc_outer_inner * 24 + 1165] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 1165] + +conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 1165] + +6 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 1165] + +7 + +conv2d_nchw[7] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +259 + +rc_outer_inner * 504 + threadIdx_x + 259 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] + +24 + +rc_outer_inner * 24 + +1357 + +rc_outer_inner * 24 + 1357 + +kernel_shared[rc_outer_inner * 24 + 1357] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 1357] + +conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 1357] + +7 + 
+conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 1357] + +4 + +conv2d_nchw[4] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +322 + +rc_outer_inner * 504 + threadIdx_x + 322 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] + +24 + +rc_outer_inner * 24 + +784 + +rc_outer_inner * 24 + 784 + +kernel_shared[rc_outer_inner * 24 + 784] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 784] + +conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 784] + +4 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 784] + +5 + +conv2d_nchw[5] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +322 + +rc_outer_inner * 504 + threadIdx_x + 322 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] + +24 + +rc_outer_inner * 24 + +976 + +rc_outer_inner * 24 + 976 + +kernel_shared[rc_outer_inner * 24 + 976] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 976] + +conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 976] + +5 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") 
+conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 976] + +6 + +conv2d_nchw[6] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +322 + +rc_outer_inner * 504 + threadIdx_x + 322 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] + +24 + +rc_outer_inner * 24 + +1168 + +rc_outer_inner * 24 + 1168 + +kernel_shared[rc_outer_inner * 24 + 1168] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 1168] + +conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 1168] + +6 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 1168] + +7 + +conv2d_nchw[7] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +322 + +rc_outer_inner * 504 + threadIdx_x + 322 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] + +24 + +rc_outer_inner * 24 + +1360 + +rc_outer_inner * 24 + 1360 + +kernel_shared[rc_outer_inner * 24 + 1360] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 1360] + +conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 1360] + +7 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 1360] + +4 + +conv2d_nchw[4] + +504 + +rc_outer_inner * 504 + 
+rc_outer_inner * 504 + threadIdx_x + +385 + +rc_outer_inner * 504 + threadIdx_x + 385 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] + +24 + +rc_outer_inner * 24 + +787 + +rc_outer_inner * 24 + 787 + +kernel_shared[rc_outer_inner * 24 + 787] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 787] + +conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 787] + +4 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 787] + +5 + +conv2d_nchw[5] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +385 + +rc_outer_inner * 504 + threadIdx_x + 385 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] + +24 + +rc_outer_inner * 24 + +979 + +rc_outer_inner * 24 + 979 + +kernel_shared[rc_outer_inner * 24 + 979] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 979] + +conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 979] + +5 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 979] + +6 + +conv2d_nchw[6] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +385 + +rc_outer_inner * 504 + threadIdx_x + 385 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] + +24 + +rc_outer_inner * 24 + +1171 + +rc_outer_inner * 24 + 
1171 + +kernel_shared[rc_outer_inner * 24 + 1171] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 1171] + +conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 1171] + +6 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 1171] + +7 + +conv2d_nchw[7] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +385 + +rc_outer_inner * 504 + threadIdx_x + 385 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] + +24 + +rc_outer_inner * 24 + +1363 + +rc_outer_inner * 24 + 1363 + +kernel_shared[rc_outer_inner * 24 + 1363] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 1363] + +conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 1363] + +7 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 1363] + +4 + +conv2d_nchw[4] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +448 + +rc_outer_inner * 504 + threadIdx_x + 448 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] + +24 + +rc_outer_inner * 24 + +790 + +rc_outer_inner * 24 + 790 + +kernel_shared[rc_outer_inner * 24 + 790] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 790] + +conv2d_nchw[4] + pad_temp_shared[rc_outer_inner 
* 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 790] + +4 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 790] + +5 + +conv2d_nchw[5] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +448 + +rc_outer_inner * 504 + threadIdx_x + 448 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] + +24 + +rc_outer_inner * 24 + +982 + +rc_outer_inner * 24 + 982 + +kernel_shared[rc_outer_inner * 24 + 982] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 982] + +conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 982] + +5 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 982] + +6 + +conv2d_nchw[6] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +448 + +rc_outer_inner * 504 + threadIdx_x + 448 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] + +24 + +rc_outer_inner * 24 + +1174 + +rc_outer_inner * 24 + 1174 + +kernel_shared[rc_outer_inner * 24 + 1174] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 1174] + +conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 1174] + +6 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = 
T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 1174] + +7 + +conv2d_nchw[7] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +448 + +rc_outer_inner * 504 + threadIdx_x + 448 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] + +24 + +rc_outer_inner * 24 + +1366 + +rc_outer_inner * 24 + 1366 + +kernel_shared[rc_outer_inner * 24 + 1366] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 1366] + +conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 1366] + +7 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 1366] + +0 + +conv2d_nchw[0] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +14 + +rc_outer_inner * 504 + threadIdx_x + 14 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] + +24 + +rc_outer_inner * 24 + +2 + +rc_outer_inner * 24 + 2 + +kernel_shared[rc_outer_inner * 24 + 2] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 2] + +conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 2] + +0 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 2] + 
+1 + +conv2d_nchw[1] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +14 + +rc_outer_inner * 504 + threadIdx_x + 14 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] + +24 + +rc_outer_inner * 24 + +194 + +rc_outer_inner * 24 + 194 + +kernel_shared[rc_outer_inner * 24 + 194] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 194] + +conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 194] + +1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 194] + +2 + +conv2d_nchw[2] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +14 + +rc_outer_inner * 504 + threadIdx_x + 14 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] + +24 + +rc_outer_inner * 24 + +386 + +rc_outer_inner * 24 + 386 + +kernel_shared[rc_outer_inner * 24 + 386] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 386] + +conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 386] + +2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 386] + +3 + +conv2d_nchw[3] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +14 + +rc_outer_inner * 504 + threadIdx_x + 14 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] + +24 + +rc_outer_inner * 
24 + +578 + +rc_outer_inner * 24 + 578 + +kernel_shared[rc_outer_inner * 24 + 578] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 578] + +conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 578] + +3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 578] + +0 + +conv2d_nchw[0] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +77 + +rc_outer_inner * 504 + threadIdx_x + 77 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] + +24 + +rc_outer_inner * 24 + +5 + +rc_outer_inner * 24 + 5 + +kernel_shared[rc_outer_inner * 24 + 5] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 5] + +conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 5] + +0 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 5] + +1 + +conv2d_nchw[1] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +77 + +rc_outer_inner * 504 + threadIdx_x + 77 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] + +24 + +rc_outer_inner * 24 + +197 + +rc_outer_inner * 24 + 197 + +kernel_shared[rc_outer_inner * 24 + 197] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 197] + +conv2d_nchw[1] + pad_temp_shared[rc_outer_inner 
* 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 197] + +1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 197] + +2 + +conv2d_nchw[2] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +77 + +rc_outer_inner * 504 + threadIdx_x + 77 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] + +24 + +rc_outer_inner * 24 + +389 + +rc_outer_inner * 24 + 389 + +kernel_shared[rc_outer_inner * 24 + 389] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 389] + +conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 389] + +2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 389] + +3 + +conv2d_nchw[3] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +77 + +rc_outer_inner * 504 + threadIdx_x + 77 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] + +24 + +rc_outer_inner * 24 + +581 + +rc_outer_inner * 24 + 581 + +kernel_shared[rc_outer_inner * 24 + 581] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 581] + +conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 581] + +3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x 
= T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 581] + +0 + +conv2d_nchw[0] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +140 + +rc_outer_inner * 504 + threadIdx_x + 140 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] + +24 + +rc_outer_inner * 24 + +8 + +rc_outer_inner * 24 + 8 + +kernel_shared[rc_outer_inner * 24 + 8] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 8] + +conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 8] + +0 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 8] + +1 + +conv2d_nchw[1] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +140 + +rc_outer_inner * 504 + threadIdx_x + 140 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] + +24 + +rc_outer_inner * 24 + +200 + +rc_outer_inner * 24 + 200 + +kernel_shared[rc_outer_inner * 24 + 200] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 200] + +conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 200] + +1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 200] + +2 + +conv2d_nchw[2] + 
+504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +140 + +rc_outer_inner * 504 + threadIdx_x + 140 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] + +24 + +rc_outer_inner * 24 + +392 + +rc_outer_inner * 24 + 392 + +kernel_shared[rc_outer_inner * 24 + 392] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 392] + +conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 392] + +2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 392] + +3 + +conv2d_nchw[3] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +140 + +rc_outer_inner * 504 + threadIdx_x + 140 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] + +24 + +rc_outer_inner * 24 + +584 + +rc_outer_inner * 24 + 584 + +kernel_shared[rc_outer_inner * 24 + 584] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 584] + +conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 584] + +3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 584] + +0 + +conv2d_nchw[0] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +203 + +rc_outer_inner * 504 + threadIdx_x + 203 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] + +24 + +rc_outer_inner * 24 + +11 
+ +rc_outer_inner * 24 + 11 + +kernel_shared[rc_outer_inner * 24 + 11] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 11] + +conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 11] + +0 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 11] + +1 + +conv2d_nchw[1] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +203 + +rc_outer_inner * 504 + threadIdx_x + 203 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] + +24 + +rc_outer_inner * 24 + +203 + +rc_outer_inner * 24 + 203 + +kernel_shared[rc_outer_inner * 24 + 203] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 203] + +conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 203] + +1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 203] + +2 + +conv2d_nchw[2] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +203 + +rc_outer_inner * 504 + threadIdx_x + 203 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] + +24 + +rc_outer_inner * 24 + +395 + +rc_outer_inner * 24 + 395 + +kernel_shared[rc_outer_inner * 24 + 395] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 395] + +conv2d_nchw[2] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 395] + +2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 395] + +3 + +conv2d_nchw[3] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +203 + +rc_outer_inner * 504 + threadIdx_x + 203 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] + +24 + +rc_outer_inner * 24 + +587 + +rc_outer_inner * 24 + 587 + +kernel_shared[rc_outer_inner * 24 + 587] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 587] + +conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 587] + +3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 587] + +0 + +conv2d_nchw[0] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +266 + +rc_outer_inner * 504 + threadIdx_x + 266 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] + +24 + +rc_outer_inner * 24 + +14 + +rc_outer_inner * 24 + 14 + +kernel_shared[rc_outer_inner * 24 + 14] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 14] + +conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 14] + +0 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") 
+rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 14] + +1 + +conv2d_nchw[1] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +266 + +rc_outer_inner * 504 + threadIdx_x + 266 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] + +24 + +rc_outer_inner * 24 + +206 + +rc_outer_inner * 24 + 206 + +kernel_shared[rc_outer_inner * 24 + 206] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 206] + +conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 206] + +1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 206] + +2 + +conv2d_nchw[2] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +266 + +rc_outer_inner * 504 + threadIdx_x + 266 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] + +24 + +rc_outer_inner * 24 + +398 + +rc_outer_inner * 24 + 398 + +kernel_shared[rc_outer_inner * 24 + 398] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 398] + +conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 398] + +2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * 
kernel_shared[rc_outer_inner * 24 + 398] + +3 + +conv2d_nchw[3] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +266 + +rc_outer_inner * 504 + threadIdx_x + 266 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] + +24 + +rc_outer_inner * 24 + +590 + +rc_outer_inner * 24 + 590 + +kernel_shared[rc_outer_inner * 24 + 590] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 590] + +conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 590] + +3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 590] + +0 + +conv2d_nchw[0] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +329 + +rc_outer_inner * 504 + threadIdx_x + 329 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] + +24 + +rc_outer_inner * 24 + +17 + +rc_outer_inner * 24 + 17 + +kernel_shared[rc_outer_inner * 24 + 17] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 17] + +conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 17] + +0 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 17] + +1 + +conv2d_nchw[1] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +329 + +rc_outer_inner * 504 + threadIdx_x + 329 + +pad_temp_shared[rc_outer_inner * 
504 + threadIdx_x + 329] + +24 + +rc_outer_inner * 24 + +209 + +rc_outer_inner * 24 + 209 + +kernel_shared[rc_outer_inner * 24 + 209] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 209] + +conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 209] + +1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 209] + +2 + +conv2d_nchw[2] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +329 + +rc_outer_inner * 504 + threadIdx_x + 329 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] + +24 + +rc_outer_inner * 24 + +401 + +rc_outer_inner * 24 + 401 + +kernel_shared[rc_outer_inner * 24 + 401] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 401] + +conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 401] + +2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 401] + +3 + +conv2d_nchw[3] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +329 + +rc_outer_inner * 504 + threadIdx_x + 329 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] + +24 + +rc_outer_inner * 24 + +593 + +rc_outer_inner * 24 + 593 + +kernel_shared[rc_outer_inner * 24 + 593] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * 
kernel_shared[rc_outer_inner * 24 + 593] + +conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 593] + +3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 593] + +0 + +conv2d_nchw[0] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +392 + +rc_outer_inner * 504 + threadIdx_x + 392 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] + +24 + +rc_outer_inner * 24 + +20 + +rc_outer_inner * 24 + 20 + +kernel_shared[rc_outer_inner * 24 + 20] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 20] + +conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 20] + +0 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 20] + +1 + +conv2d_nchw[1] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +392 + +rc_outer_inner * 504 + threadIdx_x + 392 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] + +24 + +rc_outer_inner * 24 + +212 + +rc_outer_inner * 24 + 212 + +kernel_shared[rc_outer_inner * 24 + 212] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 212] + +conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 212] + +1 + +conv2d_nchw = T.Buffer((8,), scope="local", 
align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 212] + +2 + +conv2d_nchw[2] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +392 + +rc_outer_inner * 504 + threadIdx_x + 392 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] + +24 + +rc_outer_inner * 24 + +404 + +rc_outer_inner * 24 + 404 + +kernel_shared[rc_outer_inner * 24 + 404] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 404] + +conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 404] + +2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 404] + +3 + +conv2d_nchw[3] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +392 + +rc_outer_inner * 504 + threadIdx_x + 392 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] + +24 + +rc_outer_inner * 24 + +596 + +rc_outer_inner * 24 + 596 + +kernel_shared[rc_outer_inner * 24 + 596] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 596] + +conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 596] + +3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[3] = conv2d_nchw[3] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 596] + +0 + +conv2d_nchw[0] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +455 + +rc_outer_inner * 504 + threadIdx_x + 455 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] + +24 + +rc_outer_inner * 24 + +23 + +rc_outer_inner * 24 + 23 + +kernel_shared[rc_outer_inner * 24 + 23] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 23] + +conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 23] + +0 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 23] + +1 + +conv2d_nchw[1] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +455 + +rc_outer_inner * 504 + threadIdx_x + 455 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] + +24 + +rc_outer_inner * 24 + +215 + +rc_outer_inner * 24 + 215 + +kernel_shared[rc_outer_inner * 24 + 215] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 215] + +conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 215] + +1 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 215] + +2 + +conv2d_nchw[2] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +455 + +rc_outer_inner * 
504 + threadIdx_x + 455 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] + +24 + +rc_outer_inner * 24 + +407 + +rc_outer_inner * 24 + 407 + +kernel_shared[rc_outer_inner * 24 + 407] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 407] + +conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 407] + +2 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 407] + +3 + +conv2d_nchw[3] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +455 + +rc_outer_inner * 504 + threadIdx_x + 455 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] + +24 + +rc_outer_inner * 24 + +599 + +rc_outer_inner * 24 + 599 + +kernel_shared[rc_outer_inner * 24 + 599] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 599] + +conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 599] + +3 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 599] + +4 + +conv2d_nchw[4] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +14 + +rc_outer_inner * 504 + threadIdx_x + 14 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] + +24 + +rc_outer_inner * 24 + +770 + +rc_outer_inner * 24 + 770 + +kernel_shared[rc_outer_inner * 24 + 770] + 
+pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 770] + +conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 770] + +4 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 770] + +5 + +conv2d_nchw[5] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +14 + +rc_outer_inner * 504 + threadIdx_x + 14 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] + +24 + +rc_outer_inner * 24 + +962 + +rc_outer_inner * 24 + 962 + +kernel_shared[rc_outer_inner * 24 + 962] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 962] + +conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 962] + +5 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 962] + +6 + +conv2d_nchw[6] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +14 + +rc_outer_inner * 504 + threadIdx_x + 14 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] + +24 + +rc_outer_inner * 24 + +1154 + +rc_outer_inner * 24 + 1154 + +kernel_shared[rc_outer_inner * 24 + 1154] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 1154] + +conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 1154] 
+ +6 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 1154] + +7 + +conv2d_nchw[7] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +14 + +rc_outer_inner * 504 + threadIdx_x + 14 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] + +24 + +rc_outer_inner * 24 + +1346 + +rc_outer_inner * 24 + 1346 + +kernel_shared[rc_outer_inner * 24 + 1346] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 1346] + +conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 1346] + +7 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 1346] + +4 + +conv2d_nchw[4] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +77 + +rc_outer_inner * 504 + threadIdx_x + 77 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] + +24 + +rc_outer_inner * 24 + +773 + +rc_outer_inner * 24 + 773 + +kernel_shared[rc_outer_inner * 24 + 773] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 773] + +conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 773] + +4 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") 
+conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 773] + +5 + +conv2d_nchw[5] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +77 + +rc_outer_inner * 504 + threadIdx_x + 77 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] + +24 + +rc_outer_inner * 24 + +965 + +rc_outer_inner * 24 + 965 + +kernel_shared[rc_outer_inner * 24 + 965] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 965] + +conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 965] + +5 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 965] + +6 + +conv2d_nchw[6] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +77 + +rc_outer_inner * 504 + threadIdx_x + 77 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] + +24 + +rc_outer_inner * 24 + +1157 + +rc_outer_inner * 24 + 1157 + +kernel_shared[rc_outer_inner * 24 + 1157] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 1157] + +conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 1157] + +6 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 1157] + +7 + +conv2d_nchw[7] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + 
threadIdx_x + +77 + +rc_outer_inner * 504 + threadIdx_x + 77 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] + +24 + +rc_outer_inner * 24 + +1349 + +rc_outer_inner * 24 + 1349 + +kernel_shared[rc_outer_inner * 24 + 1349] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 1349] + +conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 1349] + +7 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 1349] + +4 + +conv2d_nchw[4] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +140 + +rc_outer_inner * 504 + threadIdx_x + 140 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] + +24 + +rc_outer_inner * 24 + +776 + +rc_outer_inner * 24 + 776 + +kernel_shared[rc_outer_inner * 24 + 776] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 776] + +conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 776] + +4 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 776] + +5 + +conv2d_nchw[5] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +140 + +rc_outer_inner * 504 + threadIdx_x + 140 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] + +24 + +rc_outer_inner * 24 + +968 + +rc_outer_inner * 24 + 968 + 
+kernel_shared[rc_outer_inner * 24 + 968] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 968] + +conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 968] + +5 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 968] + +6 + +conv2d_nchw[6] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +140 + +rc_outer_inner * 504 + threadIdx_x + 140 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] + +24 + +rc_outer_inner * 24 + +1160 + +rc_outer_inner * 24 + 1160 + +kernel_shared[rc_outer_inner * 24 + 1160] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 1160] + +conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 1160] + +6 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 1160] + +7 + +conv2d_nchw[7] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +140 + +rc_outer_inner * 504 + threadIdx_x + 140 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] + +24 + +rc_outer_inner * 24 + +1352 + +rc_outer_inner * 24 + 1352 + +kernel_shared[rc_outer_inner * 24 + 1352] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 1352] + +conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 
+ threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 1352] + +7 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 1352] + +4 + +conv2d_nchw[4] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +203 + +rc_outer_inner * 504 + threadIdx_x + 203 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] + +24 + +rc_outer_inner * 24 + +779 + +rc_outer_inner * 24 + 779 + +kernel_shared[rc_outer_inner * 24 + 779] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 779] + +conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 779] + +4 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 779] + +5 + +conv2d_nchw[5] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +203 + +rc_outer_inner * 504 + threadIdx_x + 203 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] + +24 + +rc_outer_inner * 24 + +971 + +rc_outer_inner * 24 + 971 + +kernel_shared[rc_outer_inner * 24 + 971] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 971] + +conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 971] + +5 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() 
+threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 971] + +6 + +conv2d_nchw[6] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +203 + +rc_outer_inner * 504 + threadIdx_x + 203 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] + +24 + +rc_outer_inner * 24 + +1163 + +rc_outer_inner * 24 + 1163 + +kernel_shared[rc_outer_inner * 24 + 1163] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 1163] + +conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 1163] + +6 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 1163] + +7 + +conv2d_nchw[7] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +203 + +rc_outer_inner * 504 + threadIdx_x + 203 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] + +24 + +rc_outer_inner * 24 + +1355 + +rc_outer_inner * 24 + 1355 + +kernel_shared[rc_outer_inner * 24 + 1355] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 1355] + +conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 1355] + +7 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 
24 + 1355] + +4 + +conv2d_nchw[4] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +266 + +rc_outer_inner * 504 + threadIdx_x + 266 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] + +24 + +rc_outer_inner * 24 + +782 + +rc_outer_inner * 24 + 782 + +kernel_shared[rc_outer_inner * 24 + 782] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 782] + +conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 782] + +4 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 782] + +5 + +conv2d_nchw[5] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +266 + +rc_outer_inner * 504 + threadIdx_x + 266 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] + +24 + +rc_outer_inner * 24 + +974 + +rc_outer_inner * 24 + 974 + +kernel_shared[rc_outer_inner * 24 + 974] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 974] + +conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 974] + +5 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 974] + +6 + +conv2d_nchw[6] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +266 + +rc_outer_inner * 504 + threadIdx_x + 266 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 
266] + +24 + +rc_outer_inner * 24 + +1166 + +rc_outer_inner * 24 + 1166 + +kernel_shared[rc_outer_inner * 24 + 1166] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 1166] + +conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 1166] + +6 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 1166] + +7 + +conv2d_nchw[7] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +266 + +rc_outer_inner * 504 + threadIdx_x + 266 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] + +24 + +rc_outer_inner * 24 + +1358 + +rc_outer_inner * 24 + 1358 + +kernel_shared[rc_outer_inner * 24 + 1358] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 1358] + +conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 1358] + +7 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 1358] + +4 + +conv2d_nchw[4] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +329 + +rc_outer_inner * 504 + threadIdx_x + 329 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] + +24 + +rc_outer_inner * 24 + +785 + +rc_outer_inner * 24 + 785 + +kernel_shared[rc_outer_inner * 24 + 785] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * 
kernel_shared[rc_outer_inner * 24 + 785] + +conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 785] + +4 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 785] + +5 + +conv2d_nchw[5] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +329 + +rc_outer_inner * 504 + threadIdx_x + 329 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] + +24 + +rc_outer_inner * 24 + +977 + +rc_outer_inner * 24 + 977 + +kernel_shared[rc_outer_inner * 24 + 977] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 977] + +conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 977] + +5 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 977] + +6 + +conv2d_nchw[6] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +329 + +rc_outer_inner * 504 + threadIdx_x + 329 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] + +24 + +rc_outer_inner * 24 + +1169 + +rc_outer_inner * 24 + 1169 + +kernel_shared[rc_outer_inner * 24 + 1169] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 1169] + +conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 1169] + +6 + +conv2d_nchw = T.Buffer((8,), 
scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 1169] + +7 + +conv2d_nchw[7] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +329 + +rc_outer_inner * 504 + threadIdx_x + 329 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] + +24 + +rc_outer_inner * 24 + +1361 + +rc_outer_inner * 24 + 1361 + +kernel_shared[rc_outer_inner * 24 + 1361] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 1361] + +conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 1361] + +7 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 1361] + +4 + +conv2d_nchw[4] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +392 + +rc_outer_inner * 504 + threadIdx_x + 392 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] + +24 + +rc_outer_inner * 24 + +788 + +rc_outer_inner * 24 + 788 + +kernel_shared[rc_outer_inner * 24 + 788] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 788] + +conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 788] + +4 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[4] = 
conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 788] + +5 + +conv2d_nchw[5] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +392 + +rc_outer_inner * 504 + threadIdx_x + 392 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] + +24 + +rc_outer_inner * 24 + +980 + +rc_outer_inner * 24 + 980 + +kernel_shared[rc_outer_inner * 24 + 980] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 980] + +conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 980] + +5 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 980] + +6 + +conv2d_nchw[6] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +392 + +rc_outer_inner * 504 + threadIdx_x + 392 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] + +24 + +rc_outer_inner * 24 + +1172 + +rc_outer_inner * 24 + 1172 + +kernel_shared[rc_outer_inner * 24 + 1172] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 1172] + +conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 1172] + +6 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 1172] + +7 + +conv2d_nchw[7] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + 
threadIdx_x + +392 + +rc_outer_inner * 504 + threadIdx_x + 392 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] + +24 + +rc_outer_inner * 24 + +1364 + +rc_outer_inner * 24 + 1364 + +kernel_shared[rc_outer_inner * 24 + 1364] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 1364] + +conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 1364] + +7 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 1364] + +4 + +conv2d_nchw[4] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +455 + +rc_outer_inner * 504 + threadIdx_x + 455 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] + +24 + +rc_outer_inner * 24 + +791 + +rc_outer_inner * 24 + 791 + +kernel_shared[rc_outer_inner * 24 + 791] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 791] + +conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 791] + +4 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 791] + +5 + +conv2d_nchw[5] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +455 + +rc_outer_inner * 504 + threadIdx_x + 455 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] + +24 + +rc_outer_inner * 24 + +983 + +rc_outer_inner * 24 + 983 + 
+kernel_shared[rc_outer_inner * 24 + 983] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 983] + +conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 983] + +5 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 983] + +6 + +conv2d_nchw[6] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +455 + +rc_outer_inner * 504 + threadIdx_x + 455 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] + +24 + +rc_outer_inner * 24 + +1175 + +rc_outer_inner * 24 + 1175 + +kernel_shared[rc_outer_inner * 24 + 1175] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 1175] + +conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 1175] + +6 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 1175] + +7 + +conv2d_nchw[7] + +504 + +rc_outer_inner * 504 + +rc_outer_inner * 504 + threadIdx_x + +455 + +rc_outer_inner * 504 + threadIdx_x + 455 + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] + +24 + +rc_outer_inner * 24 + +1367 + +rc_outer_inner * 24 + 1367 + +kernel_shared[rc_outer_inner * 24 + 1367] + +pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 1367] + +conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 
+ threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 1367] + +7 + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 1367] + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +pad_temp_shared = T.Buffer((4032,), scope="shared") +rc_outer_inner = T.int32() +threadIdx_x = T.int32() +kernel_shared = T.Buffer((1536,), scope="shared") +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24] +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 192] +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 384] +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 576] +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 3] +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 195] +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 387] +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 579] +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 6] +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 198] +conv2d_nchw[2] = conv2d_nchw[2] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 390] +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 582] +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 9] +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 201] +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 393] +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 585] +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 12] +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 204] +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 396] +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 588] +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 15] +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 207] +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 399] +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 591] +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 18] 
+conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 210] +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 402] +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 594] +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 21] +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 213] +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 405] +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 597] +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 768] +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 960] +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 1152] +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 1344] +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 771] +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 963] +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 1155] +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * 
kernel_shared[rc_outer_inner * 24 + 1347] +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 774] +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 966] +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 1158] +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 1350] +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 777] +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 969] +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 1161] +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 1353] +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 780] +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 972] +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 1164] +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 1356] +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 783] +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 975] +conv2d_nchw[6] = conv2d_nchw[6] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 1167] +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 1359] +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 786] +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 978] +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 1170] +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 1362] +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 789] +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 981] +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 1173] +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 1365] +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1] +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 193] +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 385] +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 577] +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 4] 
+conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 196] +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 388] +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 580] +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 7] +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 199] +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 391] +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 583] +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 10] +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 202] +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 394] +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 586] +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 13] +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 205] +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 397] +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * 
kernel_shared[rc_outer_inner * 24 + 589] +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 16] +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 208] +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 400] +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 592] +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 19] +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 211] +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 403] +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 595] +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 22] +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 214] +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 406] +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 598] +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 769] +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 961] +conv2d_nchw[6] = conv2d_nchw[6] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1153] +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1345] +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 772] +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 964] +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 1156] +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 1348] +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 775] +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 967] +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 1159] +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 1351] +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 778] +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 970] +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 1162] +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 1354] +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 
781] +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 973] +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 1165] +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 1357] +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 784] +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 976] +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 1168] +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 1360] +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 787] +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 979] +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 1171] +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 1363] +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 790] +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 982] +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 1174] +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + 
threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 1366] +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 2] +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 194] +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 386] +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 578] +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 5] +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 197] +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 389] +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 581] +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 8] +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 200] +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 392] +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 584] +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 11] +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 203] +conv2d_nchw[2] = conv2d_nchw[2] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 395] +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 587] +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 14] +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 206] +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 398] +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 590] +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 17] +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 209] +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 401] +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 593] +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 20] +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 212] +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 404] +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 596] +conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 
23] +conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 215] +conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 407] +conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 599] +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 770] +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 962] +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 1154] +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 1346] +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 773] +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 965] +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 1157] +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 1349] +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 776] +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 968] +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 1160] +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * 
kernel_shared[rc_outer_inner * 24 + 1352] +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 779] +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 971] +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 1163] +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 1355] +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 782] +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 974] +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 1166] +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 1358] +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 785] +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 977] +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 1169] +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 1361] +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 788] +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 980] +conv2d_nchw[6] = conv2d_nchw[6] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 1172] +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 1364] +conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 791] +conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 983] +conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 1175] +conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 1367] + +for rc_outer_inner in range(8): + conv2d_nchw = T.Buffer((8,), scope="local", align=32) + pad_temp_shared = T.Buffer((4032,), scope="shared") + threadIdx_x = T.int32() + kernel_shared = T.Buffer((1536,), scope="shared") + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 192] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 384] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 576] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 3] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 195] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 387] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner 
* 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 579] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 6] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 198] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 390] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 582] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 9] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 201] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 393] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 585] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 12] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 204] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 396] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 588] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 15] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 207] + conv2d_nchw[2] = 
conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 399] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 591] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 18] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 210] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 402] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 594] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 21] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 213] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 405] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 597] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 768] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 960] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 1152] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 1344] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner 
* 24 + 771] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 963] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 1155] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 1347] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 774] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 966] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 1158] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 1350] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 777] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 969] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 1161] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 1353] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 780] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 972] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 1164] + conv2d_nchw[7] = conv2d_nchw[7] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 1356] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 783] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 975] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 1167] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 1359] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 786] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 978] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 1170] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 1362] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 789] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 981] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 1173] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 1365] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * 
kernel_shared[rc_outer_inner * 24 + 193] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 385] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 577] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 4] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 196] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 388] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 580] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 7] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 199] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 391] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 583] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 10] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 202] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 394] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 586] + conv2d_nchw[0] = conv2d_nchw[0] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 13] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 205] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 397] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 589] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 16] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 208] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 400] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 592] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 19] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 211] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 403] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 595] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 22] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 214] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * 
kernel_shared[rc_outer_inner * 24 + 406] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 598] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 769] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 961] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1153] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1345] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 772] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 964] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 1156] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 1348] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 775] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 967] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 1159] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 1351] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 778] + conv2d_nchw[5] = conv2d_nchw[5] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 970] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 1162] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 1354] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 781] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 973] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 1165] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 1357] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 784] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 976] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 1168] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 1360] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 787] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 979] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 1171] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * 
kernel_shared[rc_outer_inner * 24 + 1363] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 790] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 982] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 1174] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 1366] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 2] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 194] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 386] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 578] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 5] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 197] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 389] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 581] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 8] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 200] + conv2d_nchw[2] = conv2d_nchw[2] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 392] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 584] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 11] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 203] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 395] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 587] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 14] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 206] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 398] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 590] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 17] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 209] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 401] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 593] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * 
kernel_shared[rc_outer_inner * 24 + 20] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 212] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 404] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 596] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 23] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 215] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 407] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 599] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 770] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 962] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 1154] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 1346] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 773] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 965] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 1157] + conv2d_nchw[7] = conv2d_nchw[7] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 1349] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 776] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 968] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 1160] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 1352] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 779] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 971] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 1163] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 1355] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 782] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 974] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 1166] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 1358] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 785] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * 
kernel_shared[rc_outer_inner * 24 + 977] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 1169] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 1361] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 788] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 980] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 1172] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 1364] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 791] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 983] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 1175] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 1367] + +threadIdx_x = T.env_thread("threadIdx.x") +pad_temp_shared = T.Buffer((4032,), scope="shared") +rx_outer_outer = T.int32() +data = T.Buffer((25088,)) +rc_outer_outer = T.int32() +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 49] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and 
(threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 49) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 98] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 98) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 147] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 147) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 196] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 196) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 245] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 245) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 294] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer 
* 3136 + (threadIdx_x + 294) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 343] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 343) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 392] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 392) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 441] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 335], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 490] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 490) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 539] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 539) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 588] = T.if_then_else(1 <= 
(threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 588) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 637] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 637) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 686] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 686) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 735] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 735) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 784] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 784) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 833] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + 
(threadIdx_x + 833) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 882] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 678], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 931] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 931) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 980] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 980) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1029] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1029) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1078] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1078) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1127] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 
and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1127) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1176] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1176) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1225] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1225) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1274] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1274) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1323] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1021], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1372] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1372) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], 
T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1421] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1421) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1470] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1470) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1519] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1519) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1568] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1568) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1617] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1617) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1666] = T.if_then_else(1 <= 
(threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1666) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1715] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1715) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1764] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1364], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1813] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1813) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1862] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1862) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1911] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1911) // 63 * 49 + 
(threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1960] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1960) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2009] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2009) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2058] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2058) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2107] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2107) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2156] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2156) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2205] = T.if_then_else(7 <= threadIdx_x and 1 <= 
rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1707], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2254] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2254) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2303] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2303) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2352] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2352) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2401] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2401) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2450] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2450) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + 
threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2499] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2499) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2548] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2548) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2597] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2597) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2646] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2050], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2695] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2695) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2744] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= 
rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2744) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2793] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2793) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2842] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2842) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2891] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2891) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2940] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2940) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2989] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 
2989) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3038] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3038) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3087] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2393], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3136] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3136) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3185] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3185) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3234] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3234) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3283] = T.if_then_else(1 <= 
rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3283) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3332] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3332) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3381] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3381) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3430] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3430) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3479] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3479) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3528] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2736], T.float32(0.0)) +with 
T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3577] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3577) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3626] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3626) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3675] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3675) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3724] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3724) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3773] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3773) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3822] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 
and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3822) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3871] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3871) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3920] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3920) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3969] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 3079], T.float32(0.0)) +with T.launch_thread(threadIdx_x, 49): + if threadIdx_x < 14: + pad_temp_shared[threadIdx_x + 4018] = T.if_then_else(threadIdx_x < 7 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x + 41], T.float32(0.0)) +threadIdx_x_1 = T.env_thread("threadIdx.x") +kernel_shared = T.Buffer((1536,), scope="shared") +kernel = T.Buffer((2359296,)) +blockIdx_x = T.int32() +with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 49] = kernel[blockIdx_x 
* 36864 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 147] +with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 98] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 294] +with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 147] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 147) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 147) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 196] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 196) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 12] +with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 245] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 245) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 159] +with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 294] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 294) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 306] +with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 343] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 343) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 151) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 392] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 392) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 24] +with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 441] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 441) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 171] +with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 490] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 490) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 318] +with T.launch_thread(threadIdx_x_1, 49): 
+ kernel_shared[threadIdx_x_1 + 539] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 539) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 155) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 588] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 588) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 36] +with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 637] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 637) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 183] +with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 686] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 686) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 330] +with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 735] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 735) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 159) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 784] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 784) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 48] +with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 833] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 833) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 195] +with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 882] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 882) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 342] +with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 931] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 931) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 163) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 980] = kernel[blockIdx_x * 36864 + 
(threadIdx_x_1 + 980) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 60] +with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1029] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1029) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 207] +with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1078] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1078) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 354] +with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1127] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1127) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 167) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1176] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1176) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 72] +with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1225] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1225) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 219] +with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1274] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1274) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 366] +with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1323] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1323) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 171) % 192 * 3 + rx_outer_outer] +with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1372] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1372) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 84] +with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1421] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1421) // 192 * 4608 + rc_outer_outer * 576 + 
threadIdx_x_1 * 3 + rx_outer_outer + 231] +with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1470] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1470) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 378] +with T.launch_thread(threadIdx_x_1, 49): + if threadIdx_x_1 < 17: + kernel_shared[threadIdx_x_1 + 1519] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1519) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 525] +for rc_outer_inner in range(8): + conv2d_nchw = T.Buffer((8,), scope="local", align=32) + threadIdx_x_2 = T.int32() + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 192] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 384] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 576] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 3] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 195] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 387] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 579] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 6] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 198] + conv2d_nchw[2] = conv2d_nchw[2] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 390] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 582] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 9] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 201] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 393] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 585] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 12] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 204] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 396] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 588] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 15] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 207] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 399] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 591] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 
378] * kernel_shared[rc_outer_inner * 24 + 18] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 210] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 402] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 594] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 21] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 213] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 405] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 597] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 768] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 960] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 1152] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 1344] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 771] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 963] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 1155] + conv2d_nchw[7] = 
conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 1347] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 774] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 966] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 1158] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 1350] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 777] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 969] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 1161] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 1353] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 780] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 972] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 1164] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 1356] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 783] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner 
* 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 975] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 1167] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 1359] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 786] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 978] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 1170] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 1362] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 789] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 981] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 1173] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 1365] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 1] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 193] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 385] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * 
kernel_shared[rc_outer_inner * 24 + 577] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 4] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 196] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 388] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 580] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 7] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 199] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 391] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 583] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 10] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 202] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 394] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 586] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 13] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 205] + conv2d_nchw[2] = 
conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 397] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 589] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 16] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 208] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 400] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 592] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 19] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 211] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 403] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 595] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 22] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 214] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 406] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 598] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + 
threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 769] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 961] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 1153] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 1345] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 772] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 964] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 1156] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 1348] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 775] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 967] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 1159] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 1351] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 778] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 970] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 
1162] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 1354] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 781] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 973] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 1165] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 1357] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 784] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 976] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 1168] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 1360] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 787] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 979] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 1171] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 1363] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 790] + conv2d_nchw[5] = conv2d_nchw[5] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 982] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 1174] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 1366] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 2] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 194] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 386] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 578] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 5] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 197] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 389] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 581] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 8] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 200] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 392] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * 
kernel_shared[rc_outer_inner * 24 + 584] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 11] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 203] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 395] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 587] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 14] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 206] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 398] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 590] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 17] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 209] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 401] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 593] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 20] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 212] + conv2d_nchw[2] 
= conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 404] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 596] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 23] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 215] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 407] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 599] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 770] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 962] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 1154] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 1346] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 773] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 965] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 1157] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 1349] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + 
threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 776] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 968] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 1160] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 1352] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 779] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 971] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 1163] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 1355] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 782] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 974] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 1166] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 1358] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 785] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 977] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * 
kernel_shared[rc_outer_inner * 24 + 1169] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 1361] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 788] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 980] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 1172] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 1364] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 791] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 983] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 1175] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 1367] + +for rx_outer_outer in range(3): + threadIdx_x = T.env_thread("threadIdx.x") + pad_temp_shared = T.Buffer((4032,), scope="shared") + data = T.Buffer((25088,)) + rc_outer_outer = T.int32() + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 49] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 
8, data[rc_outer_outer * 3136 + (threadIdx_x + 49) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 98] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 98) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 147] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 147) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 196] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 196) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 245] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 245) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 294] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 294) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 
- 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 343] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 343) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 392] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 392) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 441] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 335], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 490] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 490) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 539] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 539) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 588] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 
7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 588) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 637] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 637) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 686] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 686) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 735] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 735) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 784] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 784) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 833] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 833) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + with 
T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 882] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 678], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 931] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 931) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 980] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 980) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1029] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1029) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1078] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1078) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1127] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, 
data[rc_outer_outer * 3136 + (threadIdx_x + 1127) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1176] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1176) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1225] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1225) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1274] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1274) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1323] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1021], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1372] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1372) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + 
pad_temp_shared[threadIdx_x + 1421] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1421) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1470] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1470) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1519] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1519) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1568] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1568) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1617] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1617) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1666] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 
8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1666) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1715] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1715) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1764] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1364], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1813] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1813) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1862] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1862) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1911] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1911) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 
- 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1960] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1960) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2009] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2009) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2058] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2058) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2107] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2107) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2156] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2156) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2205] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + 
threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1707], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2254] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2254) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2303] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2303) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2352] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2352) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2401] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2401) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2450] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2450) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with 
T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2499] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2499) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2548] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2548) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2597] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2597) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2646] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2050], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2695] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2695) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2744] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and 
rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2744) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2793] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2793) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2842] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2842) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2891] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2891) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2940] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2940) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2989] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2989) // 63 * 49 + (threadIdx_x // 
7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3038] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3038) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3087] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2393], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3136] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3136) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3185] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3185) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3234] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3234) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3283] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and 
rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3283) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3332] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3332) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3381] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3381) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3430] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3430) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3479] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3479) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3528] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2736], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + 
pad_temp_shared[threadIdx_x + 3577] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3577) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3626] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3626) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3675] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3675) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3724] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3724) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3773] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3773) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3822] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 
8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3822) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3871] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3871) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3920] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3920) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3969] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 3079], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if threadIdx_x < 14: + pad_temp_shared[threadIdx_x + 4018] = T.if_then_else(threadIdx_x < 7 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x + 41], T.float32(0.0)) + threadIdx_x_1 = T.env_thread("threadIdx.x") + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 49] = kernel[blockIdx_x * 36864 + 
rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 147] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 98] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 294] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 147] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 147) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 147) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 196] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 196) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 12] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 245] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 245) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 159] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 294] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 294) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 306] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 343] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 343) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 151) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 392] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 392) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 24] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 441] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 441) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 171] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 490] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 490) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 318] + with T.launch_thread(threadIdx_x_1, 49): 
+ kernel_shared[threadIdx_x_1 + 539] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 539) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 155) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 588] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 588) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 36] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 637] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 637) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 183] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 686] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 686) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 330] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 735] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 735) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 159) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 784] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 784) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 48] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 833] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 833) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 195] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 882] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 882) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 342] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 931] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 931) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 163) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 980] = kernel[blockIdx_x * 
36864 + (threadIdx_x_1 + 980) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 60] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1029] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1029) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 207] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1078] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1078) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 354] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1127] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1127) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 167) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1176] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1176) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 72] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1225] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1225) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 219] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1274] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1274) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 366] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1323] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1323) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 171) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1372] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1372) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 84] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1421] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1421) // 192 * 4608 + 
rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 231] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1470] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1470) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 378] + with T.launch_thread(threadIdx_x_1, 49): + if threadIdx_x_1 < 17: + kernel_shared[threadIdx_x_1 + 1519] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1519) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 525] + for rc_outer_inner in range(8): + conv2d_nchw = T.Buffer((8,), scope="local", align=32) + threadIdx_x_2 = T.int32() + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 192] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 384] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 576] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 3] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 195] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 387] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 579] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 6] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 198] + 
conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 390] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 582] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 9] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 201] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 393] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 585] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 12] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 204] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 396] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 588] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 15] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 207] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 399] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 591] + conv2d_nchw[0] = conv2d_nchw[0] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 18] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 210] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 402] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 594] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 21] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 213] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 405] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 597] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 768] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 960] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 1152] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 1344] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 771] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 963] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * 
kernel_shared[rc_outer_inner * 24 + 1155] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 1347] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 774] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 966] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 1158] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 1350] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 777] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 969] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 1161] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 1353] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 780] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 972] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 1164] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 1356] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 783] + 
conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 975] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 1167] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 1359] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 786] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 978] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 1170] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 1362] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 789] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 981] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 1173] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 1365] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 1] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 193] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 385] + conv2d_nchw[3] = conv2d_nchw[3] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 577] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 4] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 196] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 388] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 580] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 7] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 199] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 391] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 583] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 10] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 202] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 394] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 586] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 13] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * 
kernel_shared[rc_outer_inner * 24 + 205] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 397] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 589] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 16] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 208] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 400] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 592] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 19] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 211] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 403] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 595] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 22] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 214] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 406] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 598] + 
conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 769] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 961] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 1153] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 1345] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 772] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 964] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 1156] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 1348] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 775] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 967] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 1159] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 1351] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 778] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 970] + conv2d_nchw[6] = conv2d_nchw[6] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 1162] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 1354] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 781] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 973] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 1165] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 1357] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 784] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 976] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 1168] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 1360] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 787] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 979] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 1171] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 1363] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + 
threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 790] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 982] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 1174] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 1366] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 2] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 194] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 386] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 578] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 5] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 197] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 389] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 581] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 8] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 200] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 392] + 
conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 584] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 11] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 203] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 395] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 587] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 14] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 206] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 398] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 590] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 17] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 209] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 401] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 593] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 20] + conv2d_nchw[1] = conv2d_nchw[1] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 212] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 404] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 596] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 23] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 215] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 407] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 599] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 770] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 962] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 1154] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 1346] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 773] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 965] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 1157] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] 
* kernel_shared[rc_outer_inner * 24 + 1349] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 776] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 968] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 1160] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 1352] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 779] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 971] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 1163] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 1355] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 782] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 974] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 1166] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 1358] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 785] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 977] + 
conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 1169] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 1361] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 788] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 980] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 1172] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 1364] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 791] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 983] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 1175] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 1367] + +for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + threadIdx_x = T.env_thread("threadIdx.x") + pad_temp_shared = T.Buffer((4032,), scope="shared") + data = T.Buffer((25088,)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 49] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 
9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 49) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 98] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 98) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 147] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 147) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 196] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 196) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 245] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 245) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 294] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + 
(threadIdx_x + 294) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 343] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 343) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 392] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 392) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 441] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 335], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 490] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 490) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 539] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 539) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 588] = T.if_then_else(1 <= 
(threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 588) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 637] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 637) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 686] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 686) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 735] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 735) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 784] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 784) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 833] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + 
(threadIdx_x + 833) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 882] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 678], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 931] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 931) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 980] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 980) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1029] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1029) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1078] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1078) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1127] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 
9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1127) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1176] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1176) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1225] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1225) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1274] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1274) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1323] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1021], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1372] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1372) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x 
% 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1421] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1421) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1470] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1470) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1519] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1519) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1568] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1568) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1617] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1617) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1666] = 
T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1666) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1715] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1715) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1764] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1364], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1813] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1813) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1862] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1862) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1911] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 
1911) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1960] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1960) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2009] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2009) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2058] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2058) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2107] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2107) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2156] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2156) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2205] = T.if_then_else(7 
<= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1707], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2254] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2254) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2303] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2303) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2352] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2352) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2401] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2401) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2450] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2450) // 63 * 49 + (threadIdx_x // 7 + 8) 
% 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2499] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2499) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2548] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2548) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2597] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2597) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2646] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2050], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2695] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2695) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2744] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 
+ 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2744) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2793] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2793) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2842] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2842) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2891] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2891) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2940] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2940) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2989] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer 
* 3136 + (threadIdx_x + 2989) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3038] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3038) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3087] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2393], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3136] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3136) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3185] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3185) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3234] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3234) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3283] = 
T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3283) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3332] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3332) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3381] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3381) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3430] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3430) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3479] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3479) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3528] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2736], T.float32(0.0)) + 
with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3577] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3577) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3626] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3626) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3675] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3675) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3724] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3724) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3773] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3773) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3822] = T.if_then_else(1 <= (threadIdx_x // 7 
+ 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3822) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3871] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3871) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3920] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3920) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3969] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 3079], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if threadIdx_x < 14: + pad_temp_shared[threadIdx_x + 4018] = T.if_then_else(threadIdx_x < 7 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x + 41], T.float32(0.0)) + threadIdx_x_1 = T.env_thread("threadIdx.x") + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + blockIdx_x = T.int32() + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 49] 
= kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 147] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 98] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 294] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 147] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 147) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 147) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 196] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 196) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 12] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 245] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 245) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 159] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 294] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 294) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 306] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 343] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 343) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 151) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 392] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 392) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 24] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 441] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 441) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 171] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 490] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 490) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 318] + with 
T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 539] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 539) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 155) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 588] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 588) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 36] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 637] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 637) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 183] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 686] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 686) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 330] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 735] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 735) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 159) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 784] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 784) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 48] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 833] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 833) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 195] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 882] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 882) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 342] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 931] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 931) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 163) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + 
kernel_shared[threadIdx_x_1 + 980] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 980) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 60] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1029] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1029) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 207] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1078] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1078) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 354] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1127] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1127) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 167) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1176] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1176) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 72] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1225] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1225) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 219] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1274] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1274) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 366] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1323] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1323) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 171) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1372] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1372) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 84] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1421] = 
kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1421) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 231] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1470] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1470) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 378] + with T.launch_thread(threadIdx_x_1, 49): + if threadIdx_x_1 < 17: + kernel_shared[threadIdx_x_1 + 1519] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1519) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 525] + for rc_outer_inner in range(8): + conv2d_nchw = T.Buffer((8,), scope="local", align=32) + threadIdx_x_2 = T.int32() + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 192] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 384] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 576] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 3] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 195] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 387] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 579] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 6] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + 
threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 198] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 390] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 582] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 9] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 201] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 393] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 585] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 12] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 204] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 396] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 588] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 15] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 207] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 399] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 
+ 591] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 18] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 210] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 402] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 594] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 21] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 213] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 405] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 597] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 768] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 960] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 1152] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 1344] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 771] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 963] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 
+ threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 1155] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 1347] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 774] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 966] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 1158] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 1350] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 777] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 969] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 1161] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 1353] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 780] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 972] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 1164] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 1356] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * 
kernel_shared[rc_outer_inner * 24 + 783] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 975] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 1167] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 1359] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 786] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 978] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 1170] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 1362] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 789] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 981] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 1173] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 1365] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 1] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 193] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 385] + 
conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 577] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 4] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 196] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 388] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 580] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 7] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 199] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 391] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 583] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 10] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 202] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 394] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 586] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 13] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner 
* 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 205] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 397] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 589] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 16] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 208] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 400] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 592] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 19] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 211] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 403] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 595] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 22] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 214] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 406] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * 
kernel_shared[rc_outer_inner * 24 + 598] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 769] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 961] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 1153] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 1345] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 772] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 964] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 1156] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 1348] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 775] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 967] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 1159] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 1351] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 778] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 970] + conv2d_nchw[6] = 
conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 1162] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 1354] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 781] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 973] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 1165] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 1357] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 784] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 976] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 1168] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 1360] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 787] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 979] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 1171] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 1363] + conv2d_nchw[4] = conv2d_nchw[4] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 790] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 982] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 1174] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 1366] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 2] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 194] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 386] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 578] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 5] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 197] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 389] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 581] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 8] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 200] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * 
kernel_shared[rc_outer_inner * 24 + 392] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 584] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 11] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 203] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 395] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 587] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 14] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 206] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 398] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 590] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 17] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 209] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 401] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 593] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 20] + conv2d_nchw[1] 
= conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 212] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 404] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 596] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 23] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 215] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 407] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 599] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 770] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 962] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 1154] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 1346] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 773] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 965] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 1157] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + 
threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 1349] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 776] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 968] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 1160] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 1352] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 779] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 971] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 1163] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 1355] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 782] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 974] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 1166] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 1358] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 785] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * 
kernel_shared[rc_outer_inner * 24 + 977] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 1169] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 1361] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 788] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 980] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 1172] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 1364] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 791] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 983] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 1175] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 1367] + +0 + +8 + +i1_inner + +conv2d_nchw[i1_inner] + +8 + +blockIdx_x * 8 + +blockIdx_x * 8 + i1_inner + +bias[blockIdx_x * 8 + i1_inner] + +conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner] + +T.float32(0.0) + +T.max(conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) + +392 + +blockIdx_x * 392 + +49 + +i1_inner * 49 + +blockIdx_x * 392 + i1_inner * 49 + +blockIdx_x * 392 + i1_inner * 49 + threadIdx_x + +compute = T.Buffer((25088,)) +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +i1_inner = T.int32() +bias = T.Buffer((512,)) 
+blockIdx_x = T.int32() +threadIdx_x = T.int32() +compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) + +for i1_inner in range(8): + compute = T.Buffer((25088,)) + conv2d_nchw = T.Buffer((8,), scope="local", align=32) + bias = T.Buffer((512,)) + blockIdx_x = T.int32() + threadIdx_x = T.int32() + compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) + +conv2d_nchw = T.Buffer((8,), scope="local", align=32) +conv2d_nchw[0] = T.float32(0.0) +conv2d_nchw[1] = T.float32(0.0) +conv2d_nchw[2] = T.float32(0.0) +conv2d_nchw[3] = T.float32(0.0) +conv2d_nchw[4] = T.float32(0.0) +conv2d_nchw[5] = T.float32(0.0) +conv2d_nchw[6] = T.float32(0.0) +conv2d_nchw[7] = T.float32(0.0) +blockIdx_x = T.int32() +threadIdx_x_2 = T.int32() +for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + threadIdx_x = T.env_thread("threadIdx.x") + pad_temp_shared = T.Buffer((4032,), scope="shared") + data = T.Buffer((25088,)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 49] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 49) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 98] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer 
* 3136 + (threadIdx_x + 98) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 147] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 147) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 196] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 196) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 245] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 245) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 294] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 294) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 343] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 343) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], 
T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 392] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 392) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 441] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 335], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 490] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 490) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 539] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 539) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 588] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 588) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 637] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + 
(threadIdx_x + 637) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 686] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 686) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 735] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 735) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 784] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 784) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 833] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 833) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 882] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 678], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 931] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and 
(threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 931) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 980] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 980) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1029] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1029) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1078] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1078) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1127] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1127) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1176] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, 
data[rc_outer_outer * 3136 + (threadIdx_x + 1176) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1225] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1225) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1274] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1274) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1323] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1021], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1372] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1372) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1421] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1421) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + 
pad_temp_shared[threadIdx_x + 1470] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1470) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1519] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1519) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1568] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1568) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1617] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1617) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1666] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1666) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1715] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and 
rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1715) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1764] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1364], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1813] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1813) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1862] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1862) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1911] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1911) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 1960] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1960) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2009] = 
T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2009) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2058] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2058) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2107] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2107) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2156] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2156) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2205] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1707], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2254] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 
2254) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2303] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2303) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2352] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2352) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2401] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2401) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2450] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2450) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2499] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2499) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with 
T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2548] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2548) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2597] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2597) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2646] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2050], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2695] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2695) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2744] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2744) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2793] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and 
rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2793) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2842] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2842) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2891] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2891) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2940] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2940) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 2989] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2989) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3038] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3038) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + with 
T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3087] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2393], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3136] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3136) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3185] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3185) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3234] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3234) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3283] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3283) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3332] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 
< 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3332) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3381] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3381) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3430] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3430) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3479] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3479) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3528] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2736], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3577] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3577) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + 
pad_temp_shared[threadIdx_x + 3626] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3626) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3675] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3675) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3724] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3724) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3773] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3773) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3822] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3822) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3871] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 
8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3871) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3920] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3920) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + pad_temp_shared[threadIdx_x + 3969] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 3079], T.float32(0.0)) + with T.launch_thread(threadIdx_x, 49): + if threadIdx_x < 14: + pad_temp_shared[threadIdx_x + 4018] = T.if_then_else(threadIdx_x < 7 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x + 41], T.float32(0.0)) + threadIdx_x_1 = T.env_thread("threadIdx.x") + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 49] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 147] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 98] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 294] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 147] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 147) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 147) % 
192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 196] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 196) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 12] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 245] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 245) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 159] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 294] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 294) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 306] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 343] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 343) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 151) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 392] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 392) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 24] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 441] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 441) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 171] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 490] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 490) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 318] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 539] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 539) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 155) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 588] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 588) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 36] + with T.launch_thread(threadIdx_x_1, 49): 
+ kernel_shared[threadIdx_x_1 + 637] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 637) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 183] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 686] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 686) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 330] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 735] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 735) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 159) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 784] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 784) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 48] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 833] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 833) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 195] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 882] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 882) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 342] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 931] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 931) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 163) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 980] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 980) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 60] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1029] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1029) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 207] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1078] = kernel[blockIdx_x * 36864 
+ (threadIdx_x_1 + 1078) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 354] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1127] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1127) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 167) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1176] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1176) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 72] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1225] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1225) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 219] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1274] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1274) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 366] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1323] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1323) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 171) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1372] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1372) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 84] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1421] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1421) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 231] + with T.launch_thread(threadIdx_x_1, 49): + kernel_shared[threadIdx_x_1 + 1470] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1470) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 378] + with T.launch_thread(threadIdx_x_1, 49): + if threadIdx_x_1 < 17: + kernel_shared[threadIdx_x_1 + 1519] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1519) 
// 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 525] + for rc_outer_inner in range(8): + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 192] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 384] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 576] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 3] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 195] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 387] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 579] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 6] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 198] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 390] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 582] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 9] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * 
kernel_shared[rc_outer_inner * 24 + 201] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 393] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 585] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 12] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 204] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 396] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 588] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 15] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 207] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 399] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 591] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 18] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 210] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 402] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 594] + 
conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 21] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 213] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 405] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 597] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 768] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 960] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 1152] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 1344] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 771] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 963] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 1155] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 1347] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 774] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 966] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + 
threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 1158] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 1350] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 777] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 969] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 1161] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 1353] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 780] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 972] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 1164] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 1356] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 783] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 975] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 1167] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 1359] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * 
kernel_shared[rc_outer_inner * 24 + 786] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 978] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 1170] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 1362] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 789] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 981] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 1173] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 1365] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 1] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 193] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 385] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 577] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 4] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 196] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 388] + conv2d_nchw[3] = 
conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 580] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 7] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 199] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 391] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 583] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 10] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 202] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 394] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 586] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 13] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 205] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 397] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 589] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 16] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + 
threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 208] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 400] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 592] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 19] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 211] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 403] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 595] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 22] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 214] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 406] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 598] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 769] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 961] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 1153] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 
1345] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 772] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 964] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 1156] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 1348] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 775] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 967] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 1159] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 1351] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 778] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 970] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 1162] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 1354] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 781] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 973] + conv2d_nchw[6] = conv2d_nchw[6] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 1165] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 1357] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 784] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 976] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 1168] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 1360] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 787] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 979] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 1171] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 1363] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 790] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 982] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 1174] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 1366] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + 
threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 2] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 194] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 386] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 578] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 5] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 197] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 389] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 581] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 8] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 200] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 392] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 584] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 11] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 203] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 395] + 
conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 587] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 14] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 206] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 398] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 590] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 17] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 209] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 401] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 593] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 20] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 212] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 404] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 596] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 23] + conv2d_nchw[1] = conv2d_nchw[1] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 215] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 407] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 599] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 770] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 962] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 1154] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 1346] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 773] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 965] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 1157] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 1349] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 776] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 968] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 1160] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 
140] * kernel_shared[rc_outer_inner * 24 + 1352] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 779] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 971] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 1163] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 1355] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 782] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 974] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 1166] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 1358] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 785] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 977] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 1169] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 1361] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 788] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 
980] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 1172] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 1364] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 791] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 983] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 1175] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 1367] +for i1_inner in range(8): + compute = T.Buffer((25088,)) + bias = T.Buffer((512,)) + compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x_2] = T.max(conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) + +with T.launch_thread("threadIdx.x", 49) as threadIdx_x: + conv2d_nchw = T.Buffer((8,), scope="local", align=32) + conv2d_nchw[0] = T.float32(0.0) + conv2d_nchw[1] = T.float32(0.0) + conv2d_nchw[2] = T.float32(0.0) + conv2d_nchw[3] = T.float32(0.0) + conv2d_nchw[4] = T.float32(0.0) + conv2d_nchw[5] = T.float32(0.0) + conv2d_nchw[6] = T.float32(0.0) + conv2d_nchw[7] = T.float32(0.0) + blockIdx_x = T.int32() + for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + threadIdx_x_1 = T.env_thread("threadIdx.x") + pad_temp_shared = T.Buffer((4032,), scope="shared") + data = T.Buffer((25088,)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + 
pad_temp_shared[threadIdx_x_1 + 49] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 49) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 98] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 98) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 147] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 147) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 196] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 196) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 245] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 245) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 294] = 
T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 294) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 343] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 343) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 392] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 392) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 441] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 335], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 490] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 490) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 539] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and 
rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 539) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 588] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 588) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 637] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 637) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 686] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 686) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 735] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 735) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 784] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, 
data[rc_outer_outer * 3136 + (threadIdx_x_1 + 784) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 833] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 833) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 882] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 678], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 931] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 931) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 980] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 980) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1029] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1029) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], 
T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1078] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1078) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1127] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1127) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1176] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1176) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1225] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1225) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1274] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1274) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1323] = T.if_then_else(7 <= 
threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 1021], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1372] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1372) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1421] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1421) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1470] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1470) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1519] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1519) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1568] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, 
data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1568) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1617] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1617) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1666] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1666) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1715] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1715) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1764] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 1364], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1813] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1813) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 
- 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1862] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1862) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1911] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1911) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1960] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1960) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2009] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2009) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2058] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2058) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with 
T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2107] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2107) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2156] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2156) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2205] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 1707], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2254] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2254) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2303] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2303) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2352] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and 
(threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2352) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2401] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2401) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2450] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2450) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2499] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2499) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2548] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2548) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2597] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and 
rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2597) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2646] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 2050], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2695] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2695) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2744] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2744) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2793] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2793) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2842] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2842) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) 
+ with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2891] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2891) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2940] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2940) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2989] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2989) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3038] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3038) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3087] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 2393], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3136] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 
and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3136) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3185] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3185) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3234] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3234) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3283] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3283) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3332] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3332) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3381] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 
< 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3381) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3430] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3430) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3479] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3479) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3528] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 2736], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3577] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3577) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3626] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + 
(threadIdx_x_1 + 3626) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3675] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3675) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3724] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3724) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3773] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3773) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3822] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3822) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3871] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3871) // 63 * 49 
+ (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3920] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3920) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3969] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 3079], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if threadIdx_x_1 < 14: + pad_temp_shared[threadIdx_x_1 + 4018] = T.if_then_else(threadIdx_x_1 < 7 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x_1 + 41], T.float32(0.0)) + threadIdx_x_2 = T.env_thread("threadIdx.x") + kernel_shared = T.Buffer((1536,), scope="shared") + kernel = T.Buffer((2359296,)) + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared[threadIdx_x_2] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared[threadIdx_x_2 + 49] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 147] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared[threadIdx_x_2 + 98] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 294] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared[threadIdx_x_2 + 147] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 147) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 147) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared[threadIdx_x_2 + 
196] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 196) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 12] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared[threadIdx_x_2 + 245] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 245) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 159] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared[threadIdx_x_2 + 294] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 294) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 306] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared[threadIdx_x_2 + 343] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 343) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 151) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared[threadIdx_x_2 + 392] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 392) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 24] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared[threadIdx_x_2 + 441] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 441) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 171] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared[threadIdx_x_2 + 490] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 490) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 318] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared[threadIdx_x_2 + 539] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 539) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 155) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared[threadIdx_x_2 + 588] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 588) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 36] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared[threadIdx_x_2 + 637] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 637) // 192 * 
4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 183] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared[threadIdx_x_2 + 686] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 686) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 330] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared[threadIdx_x_2 + 735] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 735) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 159) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared[threadIdx_x_2 + 784] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 784) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 48] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared[threadIdx_x_2 + 833] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 833) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 195] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared[threadIdx_x_2 + 882] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 882) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 342] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared[threadIdx_x_2 + 931] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 931) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 163) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared[threadIdx_x_2 + 980] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 980) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 60] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared[threadIdx_x_2 + 1029] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1029) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 207] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared[threadIdx_x_2 + 1078] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1078) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer 
+ 354] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared[threadIdx_x_2 + 1127] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1127) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 167) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared[threadIdx_x_2 + 1176] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1176) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 72] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared[threadIdx_x_2 + 1225] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1225) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 219] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared[threadIdx_x_2 + 1274] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1274) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 366] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared[threadIdx_x_2 + 1323] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1323) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 171) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared[threadIdx_x_2 + 1372] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1372) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 84] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared[threadIdx_x_2 + 1421] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1421) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 231] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared[threadIdx_x_2 + 1470] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1470) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 378] + with T.launch_thread(threadIdx_x_2, 49): + if threadIdx_x_2 < 17: + kernel_shared[threadIdx_x_2 + 1519] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1519) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 525] + for 
rc_outer_inner in range(8): + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 192] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 384] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 576] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 3] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 195] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 387] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 579] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 6] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 198] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 390] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 582] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 9] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 201] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] 
* kernel_shared[rc_outer_inner * 24 + 393] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 585] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 12] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 204] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 396] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 588] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 15] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 207] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 399] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 591] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 18] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 210] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 402] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 594] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 21] + conv2d_nchw[1] = conv2d_nchw[1] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 213] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 405] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 597] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 768] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 960] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 1152] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 1344] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 771] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 963] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 1155] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 1347] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 774] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 966] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 1158] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 1350] + 
conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 777] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 969] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 1161] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 1353] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 780] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 972] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 1164] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 1356] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 783] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 975] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 1167] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 1359] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 786] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 978] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + 
threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 1170] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 1362] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 789] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 981] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 1173] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 1365] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 193] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 385] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 577] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 4] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 196] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 388] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 580] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 7] + conv2d_nchw[1] = conv2d_nchw[1] 
+ pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 199] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 391] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 583] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 10] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 202] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 394] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 586] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 13] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 205] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 397] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 589] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 16] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 208] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 400] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * 
kernel_shared[rc_outer_inner * 24 + 592] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 19] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 211] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 403] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 595] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 22] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 214] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 406] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 598] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 769] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 961] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1153] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1345] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 772] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 964] + conv2d_nchw[6] = conv2d_nchw[6] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 1156] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 1348] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 775] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 967] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 1159] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 1351] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 778] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 970] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 1162] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 1354] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 781] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 973] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 1165] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 1357] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * 
kernel_shared[rc_outer_inner * 24 + 784] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 976] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 1168] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 1360] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 787] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 979] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 1171] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 1363] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 790] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 982] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 1174] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 1366] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 2] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 194] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 386] + conv2d_nchw[3] = conv2d_nchw[3] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 578] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 5] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 197] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 389] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 581] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 8] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 200] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 392] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 584] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 11] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 203] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 395] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 587] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 14] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 
24 + 206] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 398] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 590] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 17] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 209] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 401] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 593] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 20] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 212] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 404] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 596] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 23] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 215] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 407] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 599] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + 
threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 770] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 962] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 1154] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 1346] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 773] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 965] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 1157] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 1349] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 776] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 968] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 1160] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 1352] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 779] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 971] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 1163] + conv2d_nchw[7] = 
conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 1355] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 782] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 974] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 1166] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 1358] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 785] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 977] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 1169] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 1361] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 788] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 980] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 1172] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 1364] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 791] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] 
* kernel_shared[rc_outer_inner * 24 + 983] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 1175] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 1367] + for i1_inner in range(8): + compute = T.Buffer((25088,)) + bias = T.Buffer((512,)) + compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) + +T.bool(True) + +with T.allocate([1536], "float32", "shared") as kernel_shared: + threadIdx_x = T.launch_thread("threadIdx.x", 49) + conv2d_nchw = T.Buffer((8,), scope="local", align=32) + conv2d_nchw[0] = T.float32(0.0) + conv2d_nchw[1] = T.float32(0.0) + conv2d_nchw[2] = T.float32(0.0) + conv2d_nchw[3] = T.float32(0.0) + conv2d_nchw[4] = T.float32(0.0) + conv2d_nchw[5] = T.float32(0.0) + conv2d_nchw[6] = T.float32(0.0) + conv2d_nchw[7] = T.float32(0.0) + blockIdx_x = T.int32() + for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + threadIdx_x_1 = T.env_thread("threadIdx.x") + pad_temp_shared = T.Buffer((4032,), scope="shared") + data = T.Buffer((25088,)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 49] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 49) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 98] = 
T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 98) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 147] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 147) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 196] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 196) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 245] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 245) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 294] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 294) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 343] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) 
% 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 343) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 392] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 392) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 441] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 335], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 490] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 490) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 539] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 539) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 588] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, 
data[rc_outer_outer * 3136 + (threadIdx_x_1 + 588) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 637] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 637) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 686] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 686) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 735] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 735) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 784] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 784) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 833] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 833) // 63 * 49 + threadIdx_x_1 + rx_outer_outer 
+ 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 882] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 678], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 931] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 931) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 980] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 980) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1029] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1029) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1078] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1078) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1127] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and 
(threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1127) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1176] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1176) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1225] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1225) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1274] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1274) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1323] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 1021], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1372] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, 
data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1372) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1421] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1421) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1470] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1470) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1519] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1519) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1568] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1568) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1617] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + 
(threadIdx_x_1 + 1617) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1666] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1666) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1715] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1715) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1764] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 1364], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1813] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1813) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1862] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1862) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with 
T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1911] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1911) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 1960] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1960) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2009] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2009) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2058] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2058) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2107] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2107) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 
49): + pad_temp_shared[threadIdx_x_1 + 2156] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2156) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2205] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 1707], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2254] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2254) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2303] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2303) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2352] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2352) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2401] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + 
threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2401) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2450] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2450) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2499] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2499) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2548] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2548) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2597] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2597) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2646] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 2050], 
T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2695] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2695) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2744] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2744) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2793] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2793) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2842] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2842) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2891] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2891) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with 
T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2940] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2940) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 2989] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2989) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3038] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3038) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3087] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 2393], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3136] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3136) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3185] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and 
(threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3185) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3234] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3234) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3283] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3283) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3332] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3332) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3381] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3381) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3430] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 
and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3430) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3479] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3479) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3528] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 2736], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3577] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3577) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3626] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3626) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3675] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 
3675) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3724] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3724) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3773] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3773) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3822] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3822) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3871] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3871) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3920] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3920) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with 
T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared[threadIdx_x_1 + 3969] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 3079], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if threadIdx_x_1 < 14: + pad_temp_shared[threadIdx_x_1 + 4018] = T.if_then_else(threadIdx_x_1 < 7 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x_1 + 41], T.float32(0.0)) + threadIdx_x_2 = T.env_thread("threadIdx.x") + kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") + kernel = T.Buffer((2359296,)) + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 49] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 147] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 98] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 294] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 147] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 147) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 147) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 196] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 196) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 12] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 245] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 245) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 159] + with T.launch_thread(threadIdx_x_2, 49): 
+ kernel_shared_1[threadIdx_x_2 + 294] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 294) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 306] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 343] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 343) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 151) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 392] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 392) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 24] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 441] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 441) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 171] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 490] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 490) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 318] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 539] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 539) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 155) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 588] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 588) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 36] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 637] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 637) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 183] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 686] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 686) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 330] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 735] = 
kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 735) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 159) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 784] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 784) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 48] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 833] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 833) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 195] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 882] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 882) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 342] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 931] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 931) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 163) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 980] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 980) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 60] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1029] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1029) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 207] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1078] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1078) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 354] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1127] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1127) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 167) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1176] = kernel[blockIdx_x * 36864 + 
(threadIdx_x_2 + 1176) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 72] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1225] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1225) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 219] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1274] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1274) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 366] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1323] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1323) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 171) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1372] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1372) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 84] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1421] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1421) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 231] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1470] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1470) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 378] + with T.launch_thread(threadIdx_x_2, 49): + if threadIdx_x_2 < 17: + kernel_shared_1[threadIdx_x_2 + 1519] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1519) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 525] + for rc_outer_inner in range(8): + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 192] + conv2d_nchw[2] = conv2d_nchw[2] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 384] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 576] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 3] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 195] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 387] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 579] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 6] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 198] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 390] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 582] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 9] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 201] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 393] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 585] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * 
kernel_shared_1[rc_outer_inner * 24 + 12] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 204] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 396] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 588] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 15] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 207] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 399] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 591] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 18] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 210] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 402] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 594] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 21] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 213] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 405] + 
conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 597] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 768] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 960] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 1152] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 1344] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 771] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 963] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 1155] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 1347] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 774] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 966] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 1158] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 1350] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 777] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + 
threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 969] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 1161] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 1353] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 780] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 972] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 1164] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 1356] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 783] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 975] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 1167] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 1359] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 786] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 978] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 1170] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * 
kernel_shared_1[rc_outer_inner * 24 + 1362] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 789] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 981] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 1173] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 1365] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 1] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 193] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 385] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 577] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 4] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 196] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 388] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 580] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 7] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 199] + conv2d_nchw[2] = 
conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 391] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 583] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 10] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 202] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 394] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 586] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 13] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 205] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 397] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 589] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 16] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 208] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 400] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 592] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + 
threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 19] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 211] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 403] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 595] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 22] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 214] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 406] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 598] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 769] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 961] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 1153] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 1345] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 772] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 964] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 1156] 
+ conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 1348] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 775] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 967] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 1159] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 1351] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 778] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 970] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 1162] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 1354] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 781] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 973] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 1165] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 1357] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 784] + conv2d_nchw[5] = conv2d_nchw[5] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 976] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 1168] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 1360] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 787] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 979] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 1171] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 1363] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 790] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 982] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 1174] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 1366] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 2] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 194] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 386] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 
14] * kernel_shared_1[rc_outer_inner * 24 + 578] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 5] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 197] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 389] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 581] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 8] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 200] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 392] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 584] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 11] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 203] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 395] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 587] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 14] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 206] + 
conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 398] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 590] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 17] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 209] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 401] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 593] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 20] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 212] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 404] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 596] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 23] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 215] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 407] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 599] + conv2d_nchw[4] = conv2d_nchw[4] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 770] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 962] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 1154] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 1346] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 773] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 965] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 1157] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 1349] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 776] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 968] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 1160] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 1352] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 779] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 971] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 
203] * kernel_shared_1[rc_outer_inner * 24 + 1163] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 1355] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 782] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 974] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 1166] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 1358] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 785] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 977] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 1169] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 1361] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 788] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 980] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 1172] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 1364] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 
791] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 983] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 1175] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 1367] + for i1_inner in range(8): + compute = T.Buffer((25088,)) + bias = T.Buffer((512,)) + compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) + +T.bool(True) + +with T.allocate([4032], "float32", "shared") as pad_temp_shared: + kernel_shared = T.allocate([1536], "float32", "shared") + threadIdx_x = T.launch_thread("threadIdx.x", 49) + conv2d_nchw = T.Buffer((8,), scope="local", align=32) + conv2d_nchw[0] = T.float32(0.0) + conv2d_nchw[1] = T.float32(0.0) + conv2d_nchw[2] = T.float32(0.0) + conv2d_nchw[3] = T.float32(0.0) + conv2d_nchw[4] = T.float32(0.0) + conv2d_nchw[5] = T.float32(0.0) + conv2d_nchw[6] = T.float32(0.0) + conv2d_nchw[7] = T.float32(0.0) + blockIdx_x = T.int32() + for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + threadIdx_x_1 = T.env_thread("threadIdx.x") + pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") + data = T.Buffer((25088,)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 49] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 49) // 63 
* 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 98] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 98) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 147] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 147) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 196] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 196) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 245] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 245) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 294] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 294) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 
* 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 343] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 343) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 392] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 392) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 441] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 335], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 490] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 490) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 539] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 539) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + 
pad_temp_shared_1[threadIdx_x_1 + 588] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 588) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 637] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 637) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 686] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 686) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 735] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 735) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 784] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 784) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 
833] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 833) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 882] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 678], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 931] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 931) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 980] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 980) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1029] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1029) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1078] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 
3136 + (threadIdx_x_1 + 1078) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1127] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1127) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1176] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1176) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1225] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1225) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1274] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1274) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1323] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 1021], T.float32(0.0)) + with 
T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1372] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1372) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1421] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1421) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1470] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1470) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1519] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1519) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1568] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1568) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with 
T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1617] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1617) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1666] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1666) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1715] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1715) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1764] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 1364], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1813] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1813) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1862] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) 
% 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1862) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1911] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1911) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1960] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1960) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2009] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2009) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2058] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2058) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2107] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 
// 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2107) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2156] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2156) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2205] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 1707], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2254] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2254) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2303] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2303) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2352] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, 
data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2352) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2401] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2401) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2450] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2450) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2499] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2499) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2548] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2548) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2597] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2597) // 63 * 49 + 
threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2646] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 2050], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2695] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2695) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2744] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2744) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2793] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2793) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2842] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2842) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2891] = 
T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2891) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2940] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2940) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2989] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2989) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3038] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3038) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3087] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 2393], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3136] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + 
threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3136) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3185] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3185) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3234] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3234) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3283] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3283) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3332] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3332) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3381] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and 
rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3381) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3430] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3430) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3479] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3479) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3528] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 2736], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3577] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3577) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3626] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3626) // 63 * 49 + (threadIdx_x_1 // 7 
+ 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3675] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3675) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3724] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3724) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3773] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3773) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3822] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3822) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3871] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3871) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + 
rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3920] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3920) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3969] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 3079], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if threadIdx_x_1 < 14: + pad_temp_shared_1[threadIdx_x_1 + 4018] = T.if_then_else(threadIdx_x_1 < 7 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x_1 + 41], T.float32(0.0)) + threadIdx_x_2 = T.env_thread("threadIdx.x") + kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") + kernel = T.Buffer((2359296,)) + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 49] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 147] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 98] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 294] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 147] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 147) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 147) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 
196] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 196) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 12] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 245] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 245) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 159] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 294] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 294) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 306] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 343] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 343) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 151) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 392] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 392) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 24] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 441] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 441) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 171] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 490] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 490) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 318] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 539] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 539) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 155) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 588] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 588) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 36] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 637] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 
637) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 183] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 686] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 686) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 330] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 735] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 735) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 159) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 784] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 784) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 48] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 833] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 833) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 195] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 882] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 882) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 342] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 931] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 931) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 163) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 980] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 980) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 60] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1029] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1029) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 207] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1078] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1078) // 192 * 4608 + rc_outer_outer * 576 + 
threadIdx_x_2 * 3 + rx_outer_outer + 354] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1127] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1127) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 167) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1176] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1176) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 72] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1225] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1225) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 219] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1274] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1274) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 366] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1323] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1323) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 171) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1372] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1372) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 84] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1421] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1421) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 231] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1470] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1470) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 378] + with T.launch_thread(threadIdx_x_2, 49): + if threadIdx_x_2 < 17: + kernel_shared_1[threadIdx_x_2 + 1519] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1519) // 192 * 4608 + rc_outer_outer * 576 + 
threadIdx_x_2 * 3 + rx_outer_outer + 525] + for rc_outer_inner in range(8): + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 192] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 384] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 576] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 3] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 195] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 387] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 579] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 6] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 198] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 390] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 582] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 9] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 
24 + 201] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 393] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 585] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 12] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 204] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 396] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 588] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 15] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 207] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 399] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 591] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 18] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 210] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 402] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 594] + conv2d_nchw[0] = 
conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 21] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 213] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 405] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 597] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 768] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 960] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 1152] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 1344] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 771] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 963] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 1155] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 1347] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 774] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 966] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner 
* 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 1158] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 1350] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 777] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 969] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 1161] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 1353] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 780] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 972] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 1164] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 1356] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 783] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 975] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 1167] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 1359] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + 
threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 786] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 978] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 1170] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 1362] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 789] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 981] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 1173] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 1365] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 1] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 193] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 385] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 577] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 4] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 196] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * 
kernel_shared_1[rc_outer_inner * 24 + 388] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 580] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 7] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 199] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 391] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 583] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 10] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 202] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 394] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 586] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 13] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 205] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 397] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 589] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner 
* 24 + 16] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 208] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 400] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 592] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 19] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 211] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 403] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 595] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 22] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 214] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 406] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 598] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 769] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 961] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 1153] + conv2d_nchw[7] = 
conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 1345] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 772] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 964] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 1156] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 1348] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 775] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 967] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 1159] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 1351] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 778] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 970] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 1162] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 1354] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 781] + conv2d_nchw[5] = conv2d_nchw[5] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 973] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 1165] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 1357] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 784] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 976] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 1168] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 1360] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 787] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 979] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 1171] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 1363] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 790] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 982] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 1174] + conv2d_nchw[7] = conv2d_nchw[7] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 1366] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 2] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 194] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 386] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 578] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 5] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 197] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 389] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 581] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 8] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 200] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 392] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 584] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 11] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 
+ threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 203] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 395] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 587] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 14] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 206] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 398] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 590] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 17] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 209] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 401] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 593] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 20] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 212] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 404] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * 
kernel_shared_1[rc_outer_inner * 24 + 596] + conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 23] + conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 215] + conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 407] + conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 599] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 770] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 962] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 1154] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 1346] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 773] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 965] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 1157] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 1349] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 776] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner 
* 24 + 968] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 1160] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 1352] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 779] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 971] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 1163] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 1355] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 782] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 974] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 1166] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 1358] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 785] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 977] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 1169] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 1361] + 
conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 788] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 980] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 1172] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 1364] + conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 791] + conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 983] + conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 1175] + conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 1367] + for i1_inner in range(8): + compute = T.Buffer((25088,)) + bias = T.Buffer((512,)) + compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) + +T.bool(True) + +with T.allocate([8], "float32", "local") as conv2d_nchw: + pad_temp_shared = T.allocate([4032], "float32", "shared") + kernel_shared = T.allocate([1536], "float32", "shared") + threadIdx_x = T.launch_thread("threadIdx.x", 49) + conv2d_nchw_1 = T.Buffer((8,), data=conv2d_nchw, scope="local", align=32) + conv2d_nchw_1[0] = T.float32(0.0) + conv2d_nchw_1[1] = T.float32(0.0) + conv2d_nchw_1[2] = T.float32(0.0) + conv2d_nchw_1[3] = T.float32(0.0) + conv2d_nchw_1[4] = T.float32(0.0) + conv2d_nchw_1[5] = T.float32(0.0) + conv2d_nchw_1[6] = T.float32(0.0) + conv2d_nchw_1[7] = T.float32(0.0) + blockIdx_x = T.int32() + for 
rc_outer_outer, rx_outer_outer in T.grid(8, 3): + threadIdx_x_1 = T.env_thread("threadIdx.x") + pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") + data = T.Buffer((25088,)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 49] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 49) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 98] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 98) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 147] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 147) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 196] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 196) // 63 * 49 + threadIdx_x_1 + 
rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 245] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 245) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 294] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 294) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 343] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 343) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 392] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 392) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 441] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 335], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 490] = 
T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 490) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 539] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 539) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 588] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 588) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 637] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 637) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 686] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 686) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 735] = T.if_then_else(1 <= 
(threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 735) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 784] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 784) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 833] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 833) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 882] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 678], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 931] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 931) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 980] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + 
threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 980) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1029] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1029) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1078] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1078) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1127] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1127) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1176] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1176) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1225] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, 
data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1225) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1274] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1274) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1323] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 1021], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1372] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1372) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1421] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1421) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1470] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1470) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + 
threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1519] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1519) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1568] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1568) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1617] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1617) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1666] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1666) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1715] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1715) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + 
pad_temp_shared_1[threadIdx_x_1 + 1764] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 1364], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1813] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1813) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1862] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1862) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1911] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1911) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1960] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1960) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2009] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= 
rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2009) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2058] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2058) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2107] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2107) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2156] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2156) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2205] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 1707], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2254] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 
2254) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2303] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2303) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2352] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2352) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2401] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2401) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2450] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2450) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2499] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2499) // 63 * 49 + 
(threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2548] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2548) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2597] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2597) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2646] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 2050], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2695] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2695) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2744] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2744) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with 
T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2793] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2793) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2842] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2842) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2891] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2891) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2940] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2940) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2989] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2989) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with 
T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3038] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3038) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3087] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 2393], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3136] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3136) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3185] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3185) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3234] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3234) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3283] = T.if_then_else(1 <= rx_outer_outer + 
threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3283) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3332] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3332) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3381] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3381) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3430] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3430) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3479] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3479) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3528] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + 
threadIdx_x_1 + rx_outer_outer + 2736], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3577] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3577) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3626] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3626) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3675] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3675) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3724] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3724) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3773] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3773) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + 
threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3822] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3822) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3871] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3871) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3920] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3920) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3969] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 3079], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if threadIdx_x_1 < 14: + pad_temp_shared_1[threadIdx_x_1 + 4018] = T.if_then_else(threadIdx_x_1 < 7 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x_1 + 41], T.float32(0.0)) + threadIdx_x_2 = T.env_thread("threadIdx.x") + kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") + kernel = 
T.Buffer((2359296,)) + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 49] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 147] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 98] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 294] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 147] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 147) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 147) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 196] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 196) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 12] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 245] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 245) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 159] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 294] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 294) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 306] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 343] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 343) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 151) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 392] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 392) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 24] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 441] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 441) // 192 * 4608 + 
rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 171] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 490] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 490) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 318] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 539] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 539) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 155) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 588] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 588) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 36] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 637] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 637) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 183] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 686] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 686) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 330] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 735] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 735) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 159) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 784] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 784) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 48] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 833] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 833) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 195] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 882] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 882) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + 
rx_outer_outer + 342] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 931] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 931) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 163) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 980] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 980) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 60] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1029] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1029) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 207] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1078] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1078) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 354] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1127] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1127) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 167) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1176] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1176) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 72] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1225] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1225) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 219] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1274] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1274) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 366] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1323] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1323) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 171) % 192 * 3 + rx_outer_outer] + with 
T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1372] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1372) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 84] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1421] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1421) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 231] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1470] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1470) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 378] + with T.launch_thread(threadIdx_x_2, 49): + if threadIdx_x_2 < 17: + kernel_shared_1[threadIdx_x_2 + 1519] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1519) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 525] + for rc_outer_inner in range(8): + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 192] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 384] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 576] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 3] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 195] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 387] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * 
kernel_shared_1[rc_outer_inner * 24 + 579] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 6] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 198] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 390] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 582] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 9] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 201] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 393] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 585] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 12] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 204] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 396] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 588] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 15] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 
504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 207] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 399] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 591] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 18] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 210] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 402] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 594] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 21] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 213] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 405] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 597] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 768] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 960] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 1152] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 1344] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 771] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 963] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 1155] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 1347] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 774] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 966] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 1158] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 1350] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 777] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 969] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 1161] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 1353] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 780] + 
conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 972] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 1164] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 1356] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 783] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 975] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 1167] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 1359] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 786] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 978] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 1170] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 1362] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 789] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 981] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * 
kernel_shared_1[rc_outer_inner * 24 + 1173] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 1365] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 1] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 193] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 385] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 577] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 4] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 196] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 388] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 580] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 7] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 199] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 391] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 583] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + 
threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 10] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 202] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 394] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 586] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 13] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 205] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 397] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 589] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 16] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 208] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 400] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 592] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 19] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 211] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 403] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 595] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 22] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 214] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 406] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 598] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 769] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 961] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 1153] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 1345] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 772] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 964] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 1156] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 1348] + conv2d_nchw_1[4] 
= conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 775] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 967] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 1159] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 1351] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 778] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 970] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 1162] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 1354] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 781] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 973] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 1165] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 1357] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 784] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * 
kernel_shared_1[rc_outer_inner * 24 + 976] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 1168] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 1360] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 787] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 979] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 1171] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 1363] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 790] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 982] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 1174] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 1366] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 2] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 194] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 386] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 578] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 5] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 197] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 389] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 581] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 8] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 200] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 392] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 584] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 11] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 203] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 395] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 587] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 14] + conv2d_nchw_1[1] = 
conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 206] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 398] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 590] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 17] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 209] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 401] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 593] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 20] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 212] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 404] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 596] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 23] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 215] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 
+ 407] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 599] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 770] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 962] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 1154] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 1346] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 773] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 965] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 1157] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 1349] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 776] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 968] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 1160] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 1352] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * 
kernel_shared_1[rc_outer_inner * 24 + 779] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 971] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 1163] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 1355] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 782] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 974] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 1166] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 1358] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 785] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 977] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 1169] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 1361] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 788] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 980] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 1172] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 1364] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 791] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 983] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 1175] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 1367] + for i1_inner in range(8): + compute = T.Buffer((25088,)) + bias = T.Buffer((512,)) + compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw_1[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) + +with T.launch_thread("blockIdx.x", 64) as blockIdx_x: + conv2d_nchw = T.allocate([8], "float32", "local") + pad_temp_shared = T.allocate([4032], "float32", "shared") + kernel_shared = T.allocate([1536], "float32", "shared") + threadIdx_x = T.launch_thread("threadIdx.x", 49) + conv2d_nchw_1 = T.Buffer((8,), data=conv2d_nchw, scope="local", align=32) + conv2d_nchw_1[0] = T.float32(0.0) + conv2d_nchw_1[1] = T.float32(0.0) + conv2d_nchw_1[2] = T.float32(0.0) + conv2d_nchw_1[3] = T.float32(0.0) + conv2d_nchw_1[4] = T.float32(0.0) + conv2d_nchw_1[5] = T.float32(0.0) + conv2d_nchw_1[6] = T.float32(0.0) + conv2d_nchw_1[7] = T.float32(0.0) + for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + threadIdx_x_1 = T.env_thread("threadIdx.x") + pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") + data = T.Buffer((25088,)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1] = 
T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 49] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 49) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 98] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 98) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 147] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 147) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 196] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 196) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 245] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + 
threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 245) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 294] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 294) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 343] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 343) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 392] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 392) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 441] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 335], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 490] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 490) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + 
rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 539] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 539) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 588] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 588) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 637] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 637) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 686] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 686) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 735] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 735) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 
7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 784] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 784) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 833] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 833) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 882] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 678], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 931] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 931) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 980] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 980) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1029] = T.if_then_else(1 <= 
(threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1029) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1078] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1078) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1127] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1127) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1176] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1176) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1225] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1225) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1274] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= 
rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1274) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1323] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 1021], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1372] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1372) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1421] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1421) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1470] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1470) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1519] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1519) // 63 * 49 + 
threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1568] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1568) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1617] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1617) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1666] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1666) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1715] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1715) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1764] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 1364], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + 
pad_temp_shared_1[threadIdx_x_1 + 1813] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1813) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1862] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1862) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1911] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1911) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1960] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1960) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2009] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2009) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + 
pad_temp_shared_1[threadIdx_x_1 + 2058] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2058) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2107] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2107) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2156] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2156) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2205] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 1707], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2254] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2254) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2303] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 
< 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2303) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2352] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2352) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2401] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2401) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2450] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2450) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2499] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2499) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2548] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= 
rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2548) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2597] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2597) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2646] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 2050], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2695] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2695) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2744] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2744) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2793] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 
2793) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2842] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2842) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2891] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2891) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2940] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2940) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2989] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2989) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3038] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3038) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + 
with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3087] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 2393], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3136] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3136) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3185] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3185) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3234] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3234) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3283] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3283) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3332] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and 
(threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3332) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3381] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3381) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3430] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3430) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3479] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3479) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3528] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 2736], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3577] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, 
data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3577) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3626] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3626) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3675] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3675) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3724] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3724) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3773] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3773) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3822] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 
3136 + (threadIdx_x_1 + 3822) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3871] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3871) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3920] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3920) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3969] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 3079], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if threadIdx_x_1 < 14: + pad_temp_shared_1[threadIdx_x_1 + 4018] = T.if_then_else(threadIdx_x_1 < 7 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x_1 + 41], T.float32(0.0)) + threadIdx_x_2 = T.env_thread("threadIdx.x") + kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") + kernel = T.Buffer((2359296,)) + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 49] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + 
rx_outer_outer + 147] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 98] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 294] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 147] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 147) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 147) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 196] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 196) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 12] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 245] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 245) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 159] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 294] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 294) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 306] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 343] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 343) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 151) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 392] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 392) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 24] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 441] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 441) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 171] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 490] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 490) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 318] + with T.launch_thread(threadIdx_x_2, 49): + 
kernel_shared_1[threadIdx_x_2 + 539] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 539) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 155) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 588] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 588) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 36] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 637] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 637) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 183] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 686] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 686) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 330] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 735] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 735) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 159) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 784] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 784) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 48] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 833] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 833) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 195] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 882] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 882) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 342] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 931] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 931) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 163) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 980] = 
kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 980) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 60] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1029] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1029) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 207] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1078] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1078) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 354] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1127] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1127) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 167) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1176] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1176) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 72] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1225] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1225) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 219] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1274] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1274) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 366] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1323] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1323) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 171) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1372] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1372) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 84] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1421] = kernel[blockIdx_x * 36864 + 
(threadIdx_x_2 + 1421) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 231] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1470] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1470) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 378] + with T.launch_thread(threadIdx_x_2, 49): + if threadIdx_x_2 < 17: + kernel_shared_1[threadIdx_x_2 + 1519] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1519) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 525] + for rc_outer_inner in range(8): + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 192] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 384] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 576] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 3] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 195] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 387] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 579] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 6] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 
24 + 198] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 390] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 582] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 9] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 201] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 393] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 585] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 12] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 204] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 396] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 588] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 15] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 207] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 399] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * 
kernel_shared_1[rc_outer_inner * 24 + 591] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 18] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 210] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 402] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 594] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 21] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 213] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 405] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 597] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 768] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 960] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 1152] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 1344] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 771] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 
63] * kernel_shared_1[rc_outer_inner * 24 + 963] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 1155] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 1347] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 774] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 966] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 1158] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 1350] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 777] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 969] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 1161] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 1353] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 780] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 972] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 1164] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 1356] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 783] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 975] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 1167] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 1359] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 786] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 978] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 1170] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 1362] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 789] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 981] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 1173] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 1365] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 1] + 
conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 193] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 385] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 577] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 4] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 196] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 388] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 580] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 7] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 199] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 391] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 583] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 10] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 202] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * 
kernel_shared_1[rc_outer_inner * 24 + 394] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 586] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 13] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 205] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 397] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 589] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 16] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 208] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 400] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 592] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 19] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 211] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 403] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 595] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 
504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 22] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 214] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 406] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 598] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 769] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 961] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 1153] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 1345] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 772] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 964] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 1156] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 1348] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 775] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 967] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 1159] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 1351] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 778] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 970] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 1162] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 1354] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 781] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 973] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 1165] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 1357] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 784] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 976] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 1168] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 1360] + 
conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 787] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 979] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 1171] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 1363] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 790] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 982] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 1174] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 1366] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 2] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 194] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 386] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 578] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 5] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * 
kernel_shared_1[rc_outer_inner * 24 + 197] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 389] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 581] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 8] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 200] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 392] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 584] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 11] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 203] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 395] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 587] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 14] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 206] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 398] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 
504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 590] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 17] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 209] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 401] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 593] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 20] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 212] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 404] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 596] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 23] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 215] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 407] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 599] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 770] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 962] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 1154] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 1346] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 773] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 965] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 1157] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 1349] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 776] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 968] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 1160] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 1352] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 779] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 971] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 1163] + 
conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 1355] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 782] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 974] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 1166] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 1358] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 785] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 977] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 1169] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 1361] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 788] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 980] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 1172] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 1364] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * 
kernel_shared_1[rc_outer_inner * 24 + 791] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 983] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 1175] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 1367] + for i1_inner in range(8): + compute = T.Buffer((25088,)) + bias = T.Buffer((512,)) + compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw_1[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) + +# from tvm.script import ir as I +# from tvm.script import tir as T + +@I.ir_module +class Module: + @T.prim_func + def main(data: T.Buffer((1, 512, 7, 7), "float32"), kernel: T.Buffer((512, 512, 3, 3), "float32"), bias: T.Buffer((1, 512, 1, 1), "float32"), compute: T.Buffer((1, 512, 7, 7), "float32")): + T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)}) + blockIdx_x = T.launch_thread("blockIdx.x", 64) + conv2d_nchw = T.allocate([8], "float32", "local") + pad_temp_shared = T.allocate([4032], "float32", "shared") + kernel_shared = T.allocate([1536], "float32", "shared") + threadIdx_x = T.launch_thread("threadIdx.x", 49) + conv2d_nchw_1 = T.Buffer((8,), data=conv2d_nchw, scope="local", align=32) + conv2d_nchw_1[0] = T.float32(0.0) + conv2d_nchw_1[1] = T.float32(0.0) + conv2d_nchw_1[2] = T.float32(0.0) + conv2d_nchw_1[3] = T.float32(0.0) + conv2d_nchw_1[4] = T.float32(0.0) + conv2d_nchw_1[5] = T.float32(0.0) + conv2d_nchw_1[6] = T.float32(0.0) + conv2d_nchw_1[7] = T.float32(0.0) + for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + cse_var_2: T.int32 = rc_outer_outer * 3136 + cse_var_1: T.int32 = rc_outer_outer * 576 + threadIdx_x_1 = T.env_thread("threadIdx.x") + pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, 
scope="shared") + data_1 = T.Buffer((25088,), data=data.data) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 49] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 49) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 98] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 98) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 147] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 147) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 196] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 196) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 245] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 
7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 245) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 294] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 294) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 343] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 343) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 392] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 392) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 441] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 335], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 490] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 490) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 
9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 539] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 539) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 588] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 588) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 637] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 637) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 686] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 686) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 735] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 735) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with 
T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 784] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 784) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 833] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 833) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 882] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 678], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 931] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 931) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 980] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 980) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1029] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= 
rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1029) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1078] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1078) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1127] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1127) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1176] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1176) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1225] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1225) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1274] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1274) // 
63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1323] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 1021], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1372] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1372) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1421] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1421) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1470] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1470) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1519] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1519) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1568] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 
9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1568) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1617] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1617) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1666] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1666) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1715] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1715) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1764] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 1364], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1813] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1813) // 63 
* 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1862] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1862) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1911] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1911) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1960] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1960) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2009] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2009) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2058] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2058) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 
% 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2107] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2107) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2156] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2156) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2205] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 1707], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2254] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2254) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2303] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2303) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2352] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and 
(threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2352) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2401] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2401) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2450] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2450) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2499] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2499) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2548] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2548) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2597] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, 
data_1[cse_var_2 + (threadIdx_x_1 + 2597) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2646] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 2050], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2695] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2695) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2744] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2744) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2793] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2793) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2842] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2842) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2891] = 
T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2891) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2940] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2940) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2989] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2989) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3038] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3038) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3087] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 2393], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3136] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, 
data_1[cse_var_2 + (threadIdx_x_1 + 3136) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3185] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3185) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3234] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3234) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3283] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3283) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3332] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3332) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3381] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3381) // 63 * 49 + (threadIdx_x_1 // 
7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3430] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3430) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3479] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3479) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3528] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 2736], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3577] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3577) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3626] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3626) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3675] = 
T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3675) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3724] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3724) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3773] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3773) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3822] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3822) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3871] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3871) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3920] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 
% 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3920) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3969] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 3079], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if threadIdx_x_1 < 14: + pad_temp_shared_1[threadIdx_x_1 + 4018] = T.if_then_else(threadIdx_x_1 < 7 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x_1 + 41], T.float32(0.0)) + threadIdx_x_2 = T.env_thread("threadIdx.x") + kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") + kernel_1 = T.Buffer((2359296,), data=kernel.data) + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2] = kernel_1[blockIdx_x * 36864 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 49] = kernel_1[blockIdx_x * 36864 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 147] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 98] = kernel_1[blockIdx_x * 36864 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 294] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 147] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 147) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 147) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 196] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 196) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 12] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 245] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 245) 
// 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 159] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 294] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 294) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 306] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 343] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 343) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 151) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 392] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 392) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 24] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 441] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 441) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 171] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 490] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 490) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 318] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 539] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 539) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 155) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 588] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 588) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 36] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 637] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 637) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 183] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 686] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 686) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 330] + with T.launch_thread(threadIdx_x_2, 49): + 
kernel_shared_1[threadIdx_x_2 + 735] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 735) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 159) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 784] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 784) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 48] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 833] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 833) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 195] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 882] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 882) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 342] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 931] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 931) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 163) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 980] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 980) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 60] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1029] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1029) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 207] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1078] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1078) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 354] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1127] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1127) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 167) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1176] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1176) // 192 * 4608 + 
cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 72] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1225] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1225) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 219] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1274] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1274) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 366] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1323] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1323) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 171) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1372] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1372) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 84] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1421] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1421) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 231] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1470] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1470) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 378] + with T.launch_thread(threadIdx_x_2, 49): + if threadIdx_x_2 < 17: + kernel_shared_1[threadIdx_x_2 + 1519] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1519) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 525] + for rc_outer_inner in range(8): + cse_var_3: T.int32 = rc_outer_inner * 24 + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 192] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 384] 
+ conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 576] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 3] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 195] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 387] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 579] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 6] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 198] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 390] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 582] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 9] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 201] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 393] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 585] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 12] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * 
kernel_shared_1[cse_var_3 + 204] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 396] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 588] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 15] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 207] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 399] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 591] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 18] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 210] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 402] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 594] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 21] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 213] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 405] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 597] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 
+ threadIdx_x] * kernel_shared_1[cse_var_3 + 768] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 960] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 1152] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 1344] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 771] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 963] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 1155] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 1347] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 774] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 966] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 1158] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 1350] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 777] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 969] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 1161] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 
504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 1353] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 780] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 972] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 1164] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 1356] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 783] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 975] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 1167] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 1359] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 786] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 978] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 1170] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 1362] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 789] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 981] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 1173] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 1365] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 1] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 193] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 385] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 577] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 4] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 196] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 388] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 580] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 7] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 199] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 391] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 583] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 10] + conv2d_nchw_1[1] = 
conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 202] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 394] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 586] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 13] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 205] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 397] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 589] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 16] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 208] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 400] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 592] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 19] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 211] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 403] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 
595] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 22] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 214] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 406] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 598] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 769] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 961] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 1153] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 1345] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 772] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 964] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 1156] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 1348] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 775] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 967] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * 
kernel_shared_1[cse_var_3 + 1159] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 1351] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 778] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 970] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 1162] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 1354] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 781] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 973] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 1165] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 1357] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 784] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 976] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 1168] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 1360] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 787] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 979] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 1171] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 1363] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 790] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 982] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 1174] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 1366] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 2] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 194] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 386] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 578] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 5] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 197] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 389] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 581] + conv2d_nchw_1[0] = 
conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 8] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 200] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 392] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 584] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 11] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 203] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 395] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 587] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 14] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 206] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 398] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 590] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 17] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 209] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 401] 
+ conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 593] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 20] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 212] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 404] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 596] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 23] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 215] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 407] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 599] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 770] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 962] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 1154] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 1346] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 773] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * 
kernel_shared_1[cse_var_3 + 965] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 1157] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 1349] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 776] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 968] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 1160] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 1352] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 779] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 971] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 1163] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 1355] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 782] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 974] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 1166] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 1358] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 785] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 977] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 1169] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 1361] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 788] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 980] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 1172] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 1364] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 791] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 983] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 1175] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 1367] + for i1_inner in range(8): + compute_1 = T.Buffer((25088,), data=compute.data) + bias_1 = T.Buffer((512,), data=bias.data) + compute_1[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw_1[i1_inner] + bias_1[blockIdx_x * 8 + i1_inner], T.float32(0.0)) +Execution time of this operator: 0.101 ms +Equivalent python schedule: +pad_temp_i0, pad_temp_i1, pad_temp_i2, pad_temp_i3 = 
tuple(pad_temp.op.axis) + tuple(pad_temp.op.reduce_axis) +conv2d_nchw_nn, conv2d_nchw_ff, conv2d_nchw_yy, conv2d_nchw_xx, conv2d_nchw_rc, conv2d_nchw_ry, conv2d_nchw_rx = tuple(conv2d_nchw.op.axis) + tuple(conv2d_nchw.op.reduce_axis) +T_add_ax0, T_add_ax1, T_add_ax2, T_add_ax3 = tuple(T_add.op.axis) + tuple(T_add.op.reduce_axis) +compute_i0, compute_i1, compute_i2, compute_i3 = tuple(compute.op.axis) + tuple(compute.op.reduce_axis) +s[T_add].compute_inline() +conv2d_nchw_nn_o_i, conv2d_nchw_nn_i = s[conv2d_nchw].split(conv2d_nchw_nn, factor=1) +conv2d_nchw_nn_o_o_i, conv2d_nchw_nn_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_i, factor=1) +conv2d_nchw_nn_o_o_o_i, conv2d_nchw_nn_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_i, factor=1) +conv2d_nchw_nn_o_o_o_o, conv2d_nchw_nn_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_o_i, factor=1) +conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=4) +conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=2) +conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=1) +conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=1) +conv2d_nchw_yy_o_i, conv2d_nchw_yy_i = s[conv2d_nchw].split(conv2d_nchw_yy, factor=1) +conv2d_nchw_yy_o_o_i, conv2d_nchw_yy_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_i, factor=1) +conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=7) +conv2d_nchw_yy_o_o_o_o, conv2d_nchw_yy_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_o_i, factor=1) +conv2d_nchw_xx_o_i, conv2d_nchw_xx_i = s[conv2d_nchw].split(conv2d_nchw_xx, factor=1) +conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=1) +conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=7) +conv2d_nchw_xx_o_o_o_o, conv2d_nchw_xx_o_o_o_i = 
s[conv2d_nchw].split(conv2d_nchw_xx_o_o_o_i, factor=1) +conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=8) +conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=8) +conv2d_nchw_ry_o_i, conv2d_nchw_ry_i = s[conv2d_nchw].split(conv2d_nchw_ry, factor=1) +conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=3) +conv2d_nchw_rx_o_i, conv2d_nchw_rx_i = s[conv2d_nchw].split(conv2d_nchw_rx, factor=1) +conv2d_nchw_rx_o_o, conv2d_nchw_rx_o_i = s[conv2d_nchw].split(conv2d_nchw_rx_o_i, factor=1) +s[conv2d_nchw].reorder(conv2d_nchw_nn_o_o_o_o, conv2d_nchw_ff_o_o_o_o, conv2d_nchw_yy_o_o_o_o, conv2d_nchw_xx_o_o_o_o, conv2d_nchw_nn_o_o_o_i, conv2d_nchw_ff_o_o_o_i, conv2d_nchw_yy_o_o_o_i, conv2d_nchw_xx_o_o_o_i, conv2d_nchw_nn_o_o_i, conv2d_nchw_ff_o_o_i, conv2d_nchw_yy_o_o_i, conv2d_nchw_xx_o_o_i, conv2d_nchw_rc_o_o, conv2d_nchw_ry_o_o, conv2d_nchw_rx_o_o, conv2d_nchw_rc_o_i, conv2d_nchw_ry_o_i, conv2d_nchw_rx_o_i, conv2d_nchw_nn_o_i, conv2d_nchw_ff_o_i, conv2d_nchw_yy_o_i, conv2d_nchw_xx_o_i, conv2d_nchw_rc_i, conv2d_nchw_ry_i, conv2d_nchw_rx_i, conv2d_nchw_nn_i, conv2d_nchw_ff_i, conv2d_nchw_yy_i, conv2d_nchw_xx_i) +compute_i0_o_i, compute_i0_i = s[compute].split(compute_i0, factor=1) +compute_i0_o_o_i, compute_i0_o_i = s[compute].split(compute_i0_o_i, factor=1) +compute_i0_o_o_o, compute_i0_o_o_i = s[compute].split(compute_i0_o_o_i, factor=1) +compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=8) +compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=1) +compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=1) +compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=1) +compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=7) +compute_i2_o_o_o, compute_i2_o_o_i = s[compute].split(compute_i2_o_o_i, factor=1) +compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=1) 
+compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=7) +compute_i3_o_o_o, compute_i3_o_o_i = s[compute].split(compute_i3_o_o_i, factor=1) +s[compute].reorder(compute_i0_o_o_o, compute_i1_o_o_o, compute_i2_o_o_o, compute_i3_o_o_o, compute_i0_o_o_i, compute_i1_o_o_i, compute_i2_o_o_i, compute_i3_o_o_i, compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i, compute_i0_i, compute_i1_i, compute_i2_i, compute_i3_i) +s[conv2d_nchw].compute_at(s[compute], compute_i3_o_i) +kernel_shared = s.cache_read(kernel, "shared", [conv2d_nchw]) +kernel_shared_ax0, kernel_shared_ax1, kernel_shared_ax2, kernel_shared_ax3 = tuple(kernel_shared.op.axis) +s[kernel_shared].compute_at(s[conv2d_nchw], conv2d_nchw_rx_o_o) +pad_temp_shared = s.cache_read(pad_temp, "shared", [conv2d_nchw]) +pad_temp_shared_ax0, pad_temp_shared_ax1, pad_temp_shared_ax2, pad_temp_shared_ax3 = tuple(pad_temp_shared.op.axis) +s[pad_temp_shared].compute_at(s[conv2d_nchw], conv2d_nchw_rx_o_o) +s[pad_temp].compute_inline() +compute_i0_o_o_o_i1_o_o_o_fused_i2_o_o_o_fused_i3_o_o_o_fused = s[compute].fuse(compute_i0_o_o_o, compute_i1_o_o_o, compute_i2_o_o_o, compute_i3_o_o_o) +s[compute].bind(compute_i0_o_o_o_i1_o_o_o_fused_i2_o_o_o_fused_i3_o_o_o_fused, te.thread_axis("blockIdx.x")) +compute_i0_o_o_i_i1_o_o_i_fused_i2_o_o_i_fused_i3_o_o_i_fused = s[compute].fuse(compute_i0_o_o_i, compute_i1_o_o_i, compute_i2_o_o_i, compute_i3_o_o_i) +s[compute].bind(compute_i0_o_o_i_i1_o_o_i_fused_i2_o_o_i_fused_i3_o_o_i_fused, te.thread_axis("vthread")) +compute_i0_o_i_i1_o_i_fused_i2_o_i_fused_i3_o_i_fused = s[compute].fuse(compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i) +s[compute].bind(compute_i0_o_i_i1_o_i_fused_i2_o_i_fused_i3_o_i_fused, te.thread_axis("threadIdx.x")) +kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[kernel_shared].fuse(kernel_shared_ax0, kernel_shared_ax1, kernel_shared_ax2, kernel_shared_ax3) +kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, 
kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1) +s[kernel_shared].vectorize(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i) +kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=49) +s[kernel_shared].bind(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x")) +pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[pad_temp_shared].fuse(pad_temp_shared_ax0, pad_temp_shared_ax1, pad_temp_shared_ax2, pad_temp_shared_ax3) +pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1) +s[pad_temp_shared].vectorize(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i) +pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=49) +s[pad_temp_shared].bind(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x")) +s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "auto_unroll_max_step", 1024) +s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "unroll_explicit", True) + diff --git a/llpass.py b/llpass.py new file mode 100644 index 000000000000..35283421970a --- /dev/null +++ b/llpass.py @@ -0,0 +1,162 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Writing a Customized Pass +========================= +**Author**: `Jian Weng `_ + +TVM is a framework that abstracts away the heterogeneity of machine learning accelerators. +Sometimes users may want to customize some analysis and IR transformations +to adapt TVM to their own specialized hardware. This tutorial helps users write +a customized pass in TVM. + +Prerequisites +------------- + +Before reading this tutorial, we assume readers already know these topics well: + +- Writing an algorithm in TVM and scheduling it. Otherwise, see example tutorials like + :ref:`opt-gemm`. +- The basic structure of HalideIR. Otherwise, see ``HalideIR/src/ir/IR.h`` to learn what + attributes of IR nodes are defined. +- Visitor design pattern. Otherwise, check the + `Python AST module `_ to see how an AST + visitor is implemented. +- How a Schedule is lowered to either an IRModule class or an LLVM module. Otherwise, + take a look at ``python/tvm/build_module.py`` to get some basics. + +""" + +import tvm +from tvm import te +import numpy as np + +###################################################################### +# We first write a very simple vector add and build it with the default schedule. Then, we use +# our customized lowering pass to manipulate the IR directly instead of using schedule primitives. 
+# + +n = tvm.tir.const(128, "int32") +a = te.placeholder((n,), name="a") +b = te.placeholder((n,), name="b") +c = te.compute((n,), lambda i: a[i] + b[i], name="c") + +sch = te.create_schedule(c.op) +ir = tvm.lower(sch, [a, b, c]) +print(ir) + +###################################################################### +# Writing a Pass +# -------------- +# Essentially, an "IR transformation pass" is a function which maps a statement to a new statement. +# Thus, we define this vectorize function and implement it step by step. +# + +###################################################################### +# TVM already provides two classes for users to both analyze and transform IR. +# +# IR Visitor +# ~~~~~~~~~~ +# We can use ``tvm.tir.stmt_functor.post_order_visit(stmt, func)`` to gather information from the Halide IR. +# ``func`` is a function callback. This function will be called before exiting the current IR node, +# i.e. post-order visit. Then we leverage side effects to store the result of the IR visit, because the +# return value of ``func`` will be ignored. +# +# .. note:: +# +# You MUST use an array to store the result of the IR visit, even if the value is a single variable. +# This is mainly due to the constraints in the Python-C runtime. The variable values will be +# refreshed every recursion but the array values will be preserved. +# + + +def find_width8(op): + """Print the visited IR node ``op`` and its Python type; returns nothing.""" + print(op) + print(type(op)) + +##################################################################### +# IR Transformation +# ~~~~~~~~~~~~~~~~~ +# The transformation interface is slightly different from the visitor interface. There is only a +# post-order callback in the visitor, but the transformation visitor supports both a pre-order and a +# post-order callback. If you want to keep the original IR node, just return None. If you want to +# change the current node to some other node, use the TVM IR maker interface to build it and return +# this value. +# +# .. 
note:: +# +# If the pre-order function is called and returns a value which is not None, the post-order +# function will be skipped. +# + + +def vectorize8(op): + """Split and vectorize the loops listed in ``loops``. NOTE(review): ``loops`` is never assigned in this file and ``find_width8`` only prints nodes, so this helper is currently unused -- confirm before calling.""" + if op in loops: + extent = op.extent.value + name = op.loop_var.name + lo, li = te.var(name + ".outer"), te.var(name + ".inner") + body = tvm.tir.stmt_functor.substitute(op.body, {op.loop_var: lo * 8 + li}) + body = tvm.tir.For(li, 0, 8, tvm.tir.ForKind.VECTORIZED, body) + body = tvm.tir.For(lo, 0, extent // 8, tvm.tir.ForKind.SERIAL, body) + return body + return None + + +@tvm.tir.transform.prim_func_pass(opt_level=0) +def vectorize(f, mod, ctx): + tvm.tir.stmt_functor.post_order_visit(f.body, find_width8)  # inspection only: find_width8 prints each visited node + return f  # the function is handed back without being rewritten + + +##################################################################### +# Glue to Lowering +# ---------------- +# So far, we are done with writing this IR transformation pass. What we need to do next is to glue +# this pass to TVM's lower pass. +# +# In this case, we inject the pass written above into the TVM standard lowering +# pass by feeding **a list of tuples** as the argument to ``tir.add_lower_pass``. "Tuple" indicates different +# phases of lowering. In TVM, there are four phases of lowering and user-customized ones will be +# called after each phase is done. +# +# .. note:: +# Here are the essential transformations done by each phase: +# - Phase 0 generates the raw IR and loop levels. +# - Phase 1 flattens the array storage. +# - Phase 2 transforms loops, like unroll, vectorization and thread-binding. +# - Phase 3 does some cleanup work. +# +# Thus, a good place to put this transformation pass is just after Phase 1. 
+# + +for i in range(4): + print(f"Phase {i}") + print("-" * 20) + with tvm.transform.PassContext(config={"tir.add_lower_pass": [(i, vectorize)]}): + print(tvm.lower(sch, [a, b, c])) + +##################################################################### +# Quick View +# ---------- +# This tutorial gives a quick view of writing a customized IR transformation pass: +# - Use ``tvm.tir.stmt_functor.post_order_visit`` to gather information on each IR nodes. +# - Use ``tvm.tir.stmt_functor.ir_transform`` to transform IR nodes. +# - Wrap up two above to write an IR-transformation function. +# - Use ``tvm.transform.PassContext`` to put this function to TVM lowering pass +# \ No newline at end of file diff --git a/lowered_tir.py b/lowered_tir.py new file mode 100644 index 000000000000..e26cd3a2dab2 --- /dev/null +++ b/lowered_tir.py @@ -0,0 +1,481 @@ +from tvm.script import ir as I +from tvm.script import tir as T + +@I.ir_module +class Module: + @T.prim_func + def main(data: T.Buffer((1, 512, 7, 7), "float32"), kernel: T.Buffer((512, 512, 3, 3), "float32"), bias: T.Buffer((1, 512, 1, 1), "float32"), compute: T.Buffer((1, 512, 7, 7), "float32")): + T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)}) + blockIdx_x = T.launch_thread("blockIdx.x", 64) + conv2d_nchw = T.allocate([8], "float32", "local") + pad_temp_shared = T.allocate([4032], "float32", "shared") + kernel_shared = T.allocate([1536], "float32", "shared") + threadIdx_x = T.launch_thread("threadIdx.x", 49) + conv2d_nchw_1 = T.Buffer((8,), data=conv2d_nchw, scope="local", align=32) + conv2d_nchw_1[0] = T.float32(0.0) + conv2d_nchw_1[1] = T.float32(0.0) + conv2d_nchw_1[2] = T.float32(0.0) + conv2d_nchw_1[3] = T.float32(0.0) + conv2d_nchw_1[4] = T.float32(0.0) + conv2d_nchw_1[5] = T.float32(0.0) + conv2d_nchw_1[6] = T.float32(0.0) + conv2d_nchw_1[7] = T.float32(0.0) + for rc_outer_outer, rx_outer_outer in T.grid(8, 3): + cse_var_2: T.int32 = rc_outer_outer * 3136 + cse_var_1: T.int32 = 
rc_outer_outer * 576 + threadIdx_x_1 = T.env_thread("threadIdx.x") + pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") + data_1 = T.Buffer((25088,), data=data.data) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 49] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 49) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 98] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 98) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 147] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 147) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 196] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 196) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with 
T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 245] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 245) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 294] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 294) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 343] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 343) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 392] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 392) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 441] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 335], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 490] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= 
rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 490) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 539] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 539) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 588] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 588) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 637] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 637) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 686] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 686) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 735] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, 
data_1[cse_var_2 + (threadIdx_x_1 + 735) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 784] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 784) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 833] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 833) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 882] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 678], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 931] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 931) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 980] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 980) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 
49): + pad_temp_shared_1[threadIdx_x_1 + 1029] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1029) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1078] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1078) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1127] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1127) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1176] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1176) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1225] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1225) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1274] = 
T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1274) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1323] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 1021], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1372] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1372) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1421] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1421) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1470] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1470) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1519] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1519) // 63 * 49 + threadIdx_x_1 + 
rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1568] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1568) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1617] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1617) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1666] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1666) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1715] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1715) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1764] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 1364], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1813] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 
and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1813) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1862] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1862) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1911] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1911) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 1960] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1960) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2009] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2009) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2058] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + 
threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2058) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2107] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2107) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2156] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2156) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2205] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 1707], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2254] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2254) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2303] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2303) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + 
threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2352] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2352) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2401] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2401) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2450] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2450) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2499] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2499) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2548] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2548) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with 
T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2597] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2597) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2646] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 2050], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2695] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2695) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2744] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2744) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2793] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2793) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2842] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 
< 8, data_1[cse_var_2 + (threadIdx_x_1 + 2842) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2891] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2891) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2940] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2940) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 2989] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2989) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3038] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3038) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3087] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 2393], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + 
pad_temp_shared_1[threadIdx_x_1 + 3136] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3136) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3185] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3185) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3234] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3234) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3283] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3283) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3332] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3332) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3381] = T.if_then_else(1 <= 
(threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3381) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3430] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3430) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3479] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3479) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3528] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 2736], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3577] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3577) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3626] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + 
(threadIdx_x_1 + 3626) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3675] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3675) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3724] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3724) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3773] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3773) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3822] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3822) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3871] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3871) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + 
rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3920] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3920) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + pad_temp_shared_1[threadIdx_x_1 + 3969] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 3079], T.float32(0.0)) + with T.launch_thread(threadIdx_x_1, 49): + if T.likely(threadIdx_x_1 < 14): + pad_temp_shared_1[threadIdx_x_1 + 4018] = T.if_then_else(threadIdx_x_1 < 7 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x_1 + 41], T.float32(0.0)) + threadIdx_x_2 = T.env_thread("threadIdx.x") + kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") + kernel_1 = T.Buffer((2359296,), data=kernel.data) + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2] = kernel_1[blockIdx_x * 36864 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 49] = kernel_1[blockIdx_x * 36864 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 147] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 98] = kernel_1[blockIdx_x * 36864 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 294] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 147] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 147) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 147) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 196] = kernel_1[blockIdx_x * 36864 + 
(threadIdx_x_2 + 196) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 12] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 245] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 245) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 159] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 294] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 294) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 306] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 343] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 343) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 151) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 392] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 392) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 24] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 441] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 441) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 171] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 490] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 490) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 318] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 539] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 539) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 155) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 588] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 588) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 36] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 637] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 637) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 183] + with T.launch_thread(threadIdx_x_2, 
49): + kernel_shared_1[threadIdx_x_2 + 686] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 686) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 330] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 735] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 735) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 159) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 784] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 784) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 48] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 833] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 833) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 195] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 882] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 882) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 342] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 931] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 931) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 163) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 980] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 980) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 60] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1029] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1029) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 207] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1078] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1078) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 354] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1127] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1127) // 192 * 4608 + cse_var_1 + 
(threadIdx_x_2 + 167) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1176] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1176) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 72] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1225] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1225) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 219] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1274] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1274) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 366] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1323] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1323) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 171) % 192 * 3 + rx_outer_outer] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1372] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1372) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 84] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1421] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1421) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 231] + with T.launch_thread(threadIdx_x_2, 49): + kernel_shared_1[threadIdx_x_2 + 1470] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1470) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 378] + with T.launch_thread(threadIdx_x_2, 49): + if T.likely(threadIdx_x_2 < 17): + kernel_shared_1[threadIdx_x_2 + 1519] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1519) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 525] + for rc_outer_inner in range(8): + cse_var_3: T.int32 = rc_outer_inner * 24 + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 192] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 384] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 576] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 3] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 195] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 387] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 579] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 6] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 198] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 390] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 582] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 9] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 201] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 393] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 585] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 12] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 204] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 396] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 588] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 15] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 207] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 399] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 591] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 18] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 210] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 402] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 594] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 21] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 213] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 405] + conv2d_nchw_1[3] 
= conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 597] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 768] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 960] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 1152] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 1344] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 771] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 963] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 1155] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 1347] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 774] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 966] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 1158] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 1350] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 777] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 969] + 
conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 1161] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 1353] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 780] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 972] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 1164] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 1356] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 783] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 975] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 1167] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 1359] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 786] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 978] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 1170] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 1362] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * 
kernel_shared_1[cse_var_3 + 789] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 981] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 1173] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 1365] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 1] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 193] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 385] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 577] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 4] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 196] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 388] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 580] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 7] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 199] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 391] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x 
+ 133] * kernel_shared_1[cse_var_3 + 583] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 10] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 202] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 394] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 586] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 13] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 205] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 397] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 589] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 16] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 208] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 400] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 592] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 19] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 211] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 403] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 595] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 22] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 214] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 406] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 598] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 769] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 961] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 1153] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 1345] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 772] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 964] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 1156] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 1348] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 775] + conv2d_nchw_1[5] = 
conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 967] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 1159] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 1351] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 778] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 970] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 1162] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 1354] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 781] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 973] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 1165] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 1357] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 784] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 976] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 1168] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * 
kernel_shared_1[cse_var_3 + 1360] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 787] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 979] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 1171] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 1363] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 790] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 982] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 1174] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 1366] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 2] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 194] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 386] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 578] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 5] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 197] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + 
threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 389] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 581] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 8] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 200] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 392] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 584] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 11] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 203] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 395] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 587] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 14] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 206] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 398] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 590] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 17] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 209] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 401] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 593] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 20] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 212] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 404] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 596] + conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 23] + conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 215] + conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 407] + conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 599] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 770] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 962] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 1154] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 1346] + conv2d_nchw_1[4] 
= conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 773] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 965] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 1157] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 1349] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 776] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 968] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 1160] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 1352] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 779] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 971] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 1163] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 1355] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 782] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 974] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * 
kernel_shared_1[cse_var_3 + 1166] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 1358] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 785] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 977] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 1169] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 1361] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 788] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 980] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 1172] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 1364] + conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 791] + conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 983] + conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 1175] + conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 1367] + for i1_inner in range(8): + compute_1 = T.Buffer((25088,), data=compute.data) + bias_1 = T.Buffer((512,), data=bias.data) + compute_1[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = 
T.max(conv2d_nchw_1[i1_inner] + bias_1[blockIdx_x * 8 + i1_inner], T.float32(0.0)) + +import tvm +from tvm import te, tir, IRModule + +mod: IRModule = Module +print(mod.script()) + +# get stmt +from tvm.tir.stmt_functor import ir_transform, post_order_visit +stmt = mod["main"] +print(stmt) + +# use post_order_visit to get all the stmt +# make a function to visit the stmt +def visit_stmt(stmt): + print("visit stmt") + print(type(stmt)) + + +tvm.tir.round() \ No newline at end of file diff --git a/python/tvm/auto_scheduler/cost_model/xgb_model.py b/python/tvm/auto_scheduler/cost_model/xgb_model.py index c7cdb15634e1..d7a977c0599b 100644 --- a/python/tvm/auto_scheduler/cost_model/xgb_model.py +++ b/python/tvm/auto_scheduler/cost_model/xgb_model.py @@ -28,6 +28,9 @@ from .cost_model import PythonBasedModel from ..feature import get_per_store_features_from_measure_pairs, get_per_store_features_from_states from ..measure_record import RecordReader +# from ..search_task import SearchTask +import tvm.te as te +import tvm try: from xgboost.callback import TrainingCallback # type: ignore @@ -235,6 +238,36 @@ def predict(self, task, states): scores: List[float] The predicted scores for all states """ + + # print tasks and states + print("XGBModel: predict") + print("task") + print(type(task)) + # print(len(task)) + print("states") + print(type(states)) + print(len(states)) + print("states[0]") + print(type(states[0])) + # print(states[0]) + + # apply the state transformations + task: SearchTask + schedule, args = task.compute_dag.apply_steps_from_state(states[0]) + schedule: te.Schedule + mod = tvm.lower(schedule, args) + print("mod") + print(type(mod)) + print(mod) + + print("schedule") + print(type(schedule)) + print(schedule) + + + + + features = get_per_store_features_from_states(states, task) if self.bst is not None and len(self.inputs) > self.num_warmup_sample: dtest, pack_ids = feature_to_pack_sum_xgbmatrix(features) diff --git a/tune_conv2d_layer_cuda.py 
b/tune_conv2d_layer_cuda.py new file mode 100644 index 000000000000..ef5d2b4f979b --- /dev/null +++ b/tune_conv2d_layer_cuda.py @@ -0,0 +1,226 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +.. _auto-scheduler-conv-gpu: + +Auto-scheduling a Convolution Layer for GPU +=========================================== +**Author**: `Lianmin Zheng <https://github.com/merrymercy>`_, \ + `Chengfan Jia <https://github.com/jcf94/>`_ + +This is a tutorial on how to use the auto-scheduler for GPUs. + +Different from the template-based :ref:`autotvm <tutorials-autotvm-sec>` which relies on +manual templates to define the search space, the auto-scheduler does not require any templates. +Users only need to write the computation declaration without any schedule commands or templates. +The auto-scheduler can automatically generate a large search space and +find a good schedule in the space. + +We use a convolution layer as an example in this tutorial. + +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block.
+""" + +# sphinx_gallery_start_ignore +# sphinx_gallery_requires_cuda = True +# sphinx_gallery_end_ignore +import os + +import numpy as np +import tvm +from tvm import te, auto_scheduler, topi +from tvm.topi.testing import conv2d_nchw_python + +###################################################################### +# Define the computation +# ^^^^^^^^^^^^^^^^^^^^^^ +# To begin with, let us define the computation of a convolution layer. +# The function should return the list of input/output tensors. +# From these tensors, the auto-scheduler can get the whole computational graph. + + +@auto_scheduler.register_workload +def conv2d_layer(N, H, W, CO, CI, KH, KW, stride, padding): + data = te.placeholder((N, CI, H, W), name="data") + kernel = te.placeholder((CO, CI, KH, KW), name="kernel") + bias = te.placeholder((1, CO, 1, 1), name="bias") + conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, dilation=1, out_dtype="float32") + out = topi.nn.relu(conv + bias) + return [data, kernel, bias, out] + + +###################################################################### +# Create the search task +# ^^^^^^^^^^^^^^^^^^^^^^ +# We then create a search task for the last convolution layer in the resnet. + +target = tvm.target.Target("cuda") + +# Use the last layer in ResNet-50 +N, H, W, CO, CI, KH, KW, strides, padding = 1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1) +task = auto_scheduler.SearchTask( + func=conv2d_layer, args=(N, H, W, CO, CI, KH, KW, strides, padding), target=target +) + +# Inspect the computational graph +print("Computational DAG:") +print(task.compute_dag) + +###################################################################### +# Next, we set parameters for the auto-scheduler. These parameters +# mainly specify how we do the measurement during the search. +# +# * :code:`measure_ctx` launches a different process for measurement to +# provide isolation. It can protect the main process from GPU crashes +# during measurement and avoid other runtime conflicts. 
+# * :code:`min_repeat_ms` defines the minimum duration of one "repeat" in every measurement. +# This can warmup the GPU, which is necessary to get accurate measurement results. +# Typically, we recommend a value >= 300 ms. +# * :code:`num_measure_trials` is the number of measurement trials we can use during the search. +# We only make 10 trials in this tutorial for a fast demonstration. In practice, 1000 is a +# good value for the search to converge. You can do more trials according to your time budget. +# * In addition, we use :code:`RecordToFile` to dump measurement records into a file `conv2d.json`. +# The measurement records can be used to query the history best, resume the search, +# and do more analyses later. +# * see :any:`auto_scheduler.TuningOptions`, +# :any:`auto_scheduler.LocalRPCMeasureContext` for more parameters. + +log_file = "conv2d.json" +measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300) +tune_option = auto_scheduler.TuningOptions( + num_measure_trials=10, # change this to 1000 to achieve the best performance + runner=measure_ctx.runner, + measure_callbacks=[auto_scheduler.RecordToFile(log_file)], + verbose=2, +) + +###################################################################### +# Run the search +# ^^^^^^^^^^^^^^ +# Now we get all inputs ready. Pretty simple, isn't it? +# We can kick off the search and let the auto-scheduler do its magic. +# After some measurement trials, we can load the best schedule from the log +# file and apply it. + +# Run auto-tuning (search) +# We do not run the tuning in our webpage server since it takes too long. +# Uncomment the following line to run it by yourself. 
+task.tune(tune_option) +# Apply the best schedule +sch, args = task.apply_best(log_file) +sch: te.Schedule + +# lower schedule to IR +mod = tvm.lower(sch, args, simple_mode=True) + +# Print the tree using relay viz +from tvm.contrib import relay_viz +viz = relay_viz.RelayVisualizer(mod) +viz.render() + +# Kill the measurement process +del measure_ctx + +###################################################################### +# We can lower the schedule to see the IR after auto-scheduling. +# The auto-scheduler correctly performs optimizations including multi-level tiling, +# cooperative fetching, unrolling and operator fusion. + +# print("Lowered TIR:") +# print(tvm.lower(sch, args, simple_mode=True)) + + +###################################################################### +# Check correctness and evaluate performance +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# We build the binary and check its correctness and performance. + +func = tvm.build(sch, args, target) + +# Check correctness +data_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32) +weight_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32) +bias_np = np.random.uniform(size=(1, CO, 1, 1)).astype(np.float32) +conv_np = conv2d_nchw_python(data_np, weight_np, strides, padding) +out_np = np.maximum(conv_np + bias_np, 0.0) + +dev = tvm.cuda() +data_tvm = tvm.nd.array(data_np, device=dev) +weight_tvm = tvm.nd.array(weight_np, device=dev) +bias_tvm = tvm.nd.array(bias_np, device=dev) +out_tvm = tvm.nd.empty(out_np.shape, device=dev) +func(data_tvm, weight_tvm, bias_tvm, out_tvm) + +# Check results +np.testing.assert_allclose(out_np, out_tvm.numpy(), rtol=1e-3) + +# Evaluate execution time +evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500) +print( + "Execution time of this operator: %.3f ms" + % (np.median(evaluator(data_tvm, weight_tvm, bias_tvm, out_tvm).results) * 1000) +) + +###################################################################### +# Using the 
record file +# ^^^^^^^^^^^^^^^^^^^^^ +# During the search, all measurement records are dumped into the record +# file "conv2d.json". The measurement records can be used to re-apply search results, +# resume the search, and perform other analyses. + +###################################################################### +# Here is an example where we load the best schedule from a file, +# print the equivalent python schedule API and CUDA source code. +# They can be used for debugging and learning the behavior of the auto-scheduler. + +print("Equivalent python schedule:") +print(task.print_best(log_file, print_mode="schedule")) + +# print("CUDA source code:") +# print(task.print_best(log_file, print_mode="cuda")) + +###################################################################### +# A more complicated example is to resume the search. +# In this case, we need to create the search policy and cost model by ourselves +# and resume the status of search policy and cost model with the log file. +# In the example below we resume the status and do 5 more trials. + + +def resume_search(task, log_file): + print("Resume search:") + cost_model = auto_scheduler.XGBModel() + cost_model.update_from_file(log_file) + search_policy = auto_scheduler.SketchPolicy( + task, cost_model, init_search_callbacks=[auto_scheduler.PreloadMeasuredStates(log_file)] + ) + measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300) + tune_option = auto_scheduler.TuningOptions( + num_measure_trials=5, + runner=measure_ctx.runner, + measure_callbacks=[auto_scheduler.RecordToFile(log_file)], + ) + task.tune(tune_option, search_policy=search_policy) + + # Kill the measurement process + del measure_ctx + + +# We do not run the tuning in our webpage server since it takes too long. +# Uncomment the following line to run it by yourself. 
+# resume_search(task, log_file) From 66b505c3877bd258ac964bd87b9fc15cf37dab78 Mon Sep 17 00:00:00 2001 From: Dwijen Chawra Date: Tue, 3 Dec 2024 20:22:01 -0500 Subject: [PATCH 2/8] adding gnnmodel to auto_scheduler namespace --- python/tvm/auto_scheduler/__init__.py | 2 +- .../auto_scheduler/cost_model/gnn_model.py | 681 ++++++++++++++++++ 2 files changed, 682 insertions(+), 1 deletion(-) create mode 100644 python/tvm/auto_scheduler/cost_model/gnn_model.py diff --git a/python/tvm/auto_scheduler/__init__.py b/python/tvm/auto_scheduler/__init__.py index 97ac323662bb..5b80841a8cac 100644 --- a/python/tvm/auto_scheduler/__init__.py +++ b/python/tvm/auto_scheduler/__init__.py @@ -38,7 +38,7 @@ LayoutRewriteOption, get_shape_from_rewritten_layout, ) -from .cost_model import RandomModel, XGBModel +from .cost_model import RandomModel, XGBModel, GNNModel from .dispatcher import ApplyHistoryBest, ApplyHistoryBestOrSample, DispatchContext from .measure import ( LocalBuilder, diff --git a/python/tvm/auto_scheduler/cost_model/gnn_model.py b/python/tvm/auto_scheduler/cost_model/gnn_model.py new file mode 100644 index 000000000000..97b54d429d88 --- /dev/null +++ b/python/tvm/auto_scheduler/cost_model/gnn_model.py @@ -0,0 +1,681 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name + +"""Cost model based on xgboost""" +import multiprocessing +import logging +from typing import Dict +from collections import defaultdict + +import numpy as np + +from tvm.autotvm.tuner.metric import max_curve +from .cost_model import PythonBasedModel +from ..feature import get_per_store_features_from_measure_pairs, get_per_store_features_from_states +from ..measure_record import RecordReader +# from ..search_task import SearchTask +import tvm.te as te +import tvm +import networkx as nx +import matplotlib.pyplot as plt +from tvm.relay import ExprVisitor + + +try: + from xgboost.callback import TrainingCallback # type: ignore +except ImportError: + + class TrainingCallback: # type: ignore + pass + + +xgb = None + +logger = logging.getLogger("auto_scheduler") + +class GraphBuilder(ExprVisitor): + def __init__(self): + super().__init__() + self.graph = nx.DiGraph() # Create a directed graph + self.current_node = None # To keep track of the current node being visited + + def visit(self, expr): + # to start just print the type of the expression + print(type(expr)) + super().visit(expr) + +class GNNCostModel(PythonBasedModel): + """Train a GNN model that learns from the AST representation of a TIR program + and predicts the performance of the program. + + This model takes in a state and a task, and instead of computing the features from the state, it will + convert the state into TE, lower it to TIR, then parse the TIR to get the AST representation. + + Then we take each node in the AST, and convert it into a learned embedding. + + We then pass these embeddings into a GNN model, which will output a prediction of the performance of the program. 
+ """ + + def __init__( + self, + verbose_eval=25, + num_warmup_sample=100, + seed=None, + model_file=None, + adaptive_training=False, + ): + global xgb + try: + if xgb is None: + xgb = __import__("xgboost") + except ImportError: + # add "from Node" to silence + # "During handling of the above exception, another exception occurred" + raise ImportError( + "XGBoost is required for XGBModel. " + "Please install its python package first. " + "Help: (https://xgboost.readthedocs.io/en/latest/) " + ) from None + + self.xgb_params = { + "max_depth": 10, + "gamma": 0.001, + "min_child_weight": 0, + "eta": 0.2, + # todo(merrymercy): automatically decrease learning rate when the loss is too large + "n_gpus": 0, + "nthread": multiprocessing.cpu_count() // 2, + "verbosity": 0, + "seed": seed or 43, + "disable_default_eval_metric": 1, + } + self.bst = None + self.plan_size = 32 + self.num_warmup_sample = num_warmup_sample + self.verbose_eval = verbose_eval + self.model_file = model_file + self.adaptive_training = adaptive_training + + super().__init__() + + # cache measurement input/result pairs and extracted features + self.inputs = [] + self.results = [] + self.last_train_length = 0 + self.inputs_feature_cache = [] + + def update(self, inputs, results): + """Update the cost model according to new measurement results (training data). + XGBoost does not support incremental training, so we re-train a new model every time. 
+ Parameters + ---------- + inputs : List[MeasureInput] + The measurement inputs + results : List[MeasureResult] + The measurement results + """ + if len(inputs) <= 0: + return + assert len(inputs) == len(results) + + self.inputs.extend(inputs) + self.results.extend(results) + + if ( + self.adaptive_training + and len(self.inputs) - self.last_train_length < self.last_train_length / 5 + ): + # Set a training threshold related to `last_train_length` to reduce the training + # overhead when there're too many logs + return + self.last_train_length = len(self.inputs) + + # extract feature + n_cached = len(self.inputs_feature_cache) + features, normalized_throughputs, task_ids = get_per_store_features_from_measure_pairs( + self.inputs, self.results, skip_first_n_feature_extraction=n_cached + ) + if n_cached > 0: + features = list(features) + features[:n_cached] = self.inputs_feature_cache + features = np.array(features, dtype=object) + self.inputs_feature_cache = features + dtrain = pack_sum_xgbmatrix( + features, normalized_throughputs, task_ids, normalized_throughputs + ) + + # train xgb model + self.bst = xgb.train( + self.xgb_params, + dtrain, + num_boost_round=10000, + obj=pack_sum_square_error, + callbacks=[ + CustomCallback( + stopping_rounds=50, + metric="tr-p-rmse", + fevals=[pack_sum_rmse, pack_sum_average_peak_score(self.plan_size)], + evals=[(dtrain, "tr")], + maximize=False, + verbose_eval=self.verbose_eval, + ) + ], + ) + + # Update the model file if it has been set + if self.model_file: + self.save(self.model_file) + + def predict(self, task, states): + """Predict the scores of states + Parameters + ---------- + search_task : SearchTask + The search task of states + statse : List[State] + The input states + Returns + ------- + scores: List[float] + The predicted scores for all states + """ + + # print tasks and states + print("XGBModel: predict") + print("task") + print(type(task)) + task: SearchTask + + + # we will convert the AST into a networkx graph + 
graph = nx.DiGraph() + + + + ## node visiting using tvm.tir.stmt_functor.post_order_visit + def visit_node(op): + """Split can vectorize the loops found in `find_width8`.""" + return None + + @tvm.tir.transform.prim_func_pass(opt_level=0) + def ast_extractor(f, mod, ctx): + tvm.tir.stmt_functor.post_order_visit(f.body, visit_node) + return f + + for state in states: + # apply the state transformations + schedule, args = task.compute_dag.apply_steps_from_state(state) + schedule: te.Schedule + + # with tvm.transform.PassContext(config={"tir.add_lower_pass": [(3, ast_extractor)]}): + mod: tvm.ir.module.IRModule = tvm.lower(schedule, args) + + ## node visiting using ExprVisitor + visitor = GraphBuilder() + visitor.visit(mod["main"]) + + print('exiting') + exit() + + + features = get_per_store_features_from_states(states, task) + if self.bst is not None and len(self.inputs) > self.num_warmup_sample: + dtest, pack_ids = feature_to_pack_sum_xgbmatrix(features) + raw_preds = self.bst.predict(dtest) + ret = predict_throughput_pack_sum(raw_preds, pack_ids) + else: + ret = np.random.uniform(0, 1, (len(states),)) + + # Predict -inf for invalid states that failed to be lowered. + for idx, feature in enumerate(features): + if feature.min() == feature.max() == 0: + ret[idx] = float("-inf") + + return ret + + def predict_stages(self, task, states): + """Predict the scores of all stages in states. This is the breakdown version of `predict`. + + Parameters + ---------- + search_task : SearchTask + The search task of states + statse : List[State] + The input states + + Returns + ------- + scores: List[float] + The predicted scores for all stages in all states in the packed format + + Note + ---- + For faster data copy between c++ and python, the python part returns scores in a + single flatten array using a packed format. The c++ part then unpacks the flatten array. + The packed format is: + { + + float scores[N]; // scores[i] is the score for states[i]. 
+ int n_stage_0; // the number of stages in states[0] + float stage_scores_0[n_stage_0]; // the scores for all stages in states[0] + int n_stage_1; // the number of stages in states[1] + float stage_scores_1[n_stage_1]; // the scores for all stages in states[1] + ... + int n_stage_i; // the number of stages in states[i] + float stage_scores_i[n_stage_i]; // the scores for all stages in states[i] + ... // until i == N - 1 + + } + To implement this format, we also store int as float, so we can store all numbers + into a single float array. + """ + features = get_per_store_features_from_states(states, task) + if self.bst is not None and len(self.inputs) > self.num_warmup_sample: + dtest, pack_ids = feature_to_pack_sum_xgbmatrix(features) + raw_preds = self.bst.predict(dtest) + breakdown = predict_throughput_pack_sum(raw_preds, pack_ids) + stage_scores = [[] for _ in range(len(states))] + for pred, pack_id in zip(raw_preds, pack_ids): + stage_scores[pack_id].append(pred) + for idx, stage_score in enumerate(stage_scores): + breakdown = np.append(breakdown, len(stage_score)) + breakdown = np.concatenate((breakdown, np.array(stage_score))) + else: + breakdown = np.concatenate( + (np.random.uniform(0, 1, (len(states),)), np.zeros(len(states))) + ) + + # Predict -inf for invalid states that failed to be lowered. + for idx, feature in enumerate(features): + if feature.min() == feature.max() == 0: + breakdown[idx] = float("-inf") + + return breakdown + + def update_from_file(self, file_name, n_lines=None): + """Load measure records from a log file to update the cost model. + This function can be used to pre-train the cost model with history log files. 
+ Parameters + ---------- + file_name: str + The filename + n_lines: Optional[int] + Only load first n lines of the log file + """ + inputs, results = RecordReader(file_name).read_lines(n_lines) + logger.info("XGBModel: Loaded %s measurement records from %s", len(inputs), file_name) + self.update(inputs, results) + + def save(self, file_name: str): + """Save the model to a file + Parameters + ---------- + file_name: str + The filename + """ + self.bst.save_model(file_name) + + def load(self, file_name: str): + """Load the model from a file + Parameters + ---------- + file_name: str + The filename + """ + if self.bst is None: + self.bst = xgb.Booster(self.xgb_params) + self.bst.load_model(file_name) + self.num_warmup_sample = -1 + + +def feature_to_pack_sum_xgbmatrix(xs): + """Convert an extracted multi-stage feature vector to a xgbmatrix in pack-sum format + Parameters + ---------- + xs: np.ndarray + The feature vector + Returns + ------- + dmatrix: xgb.DMatrix + The DMatrix + pack_ids: List[int] + pack ids information + """ + x_flatten = [] + pack_ids = [] + + for ct, x in enumerate(xs): + for row in x: + x_flatten.append(row) + pack_ids.append(ct) + + return xgb.DMatrix(np.array(x_flatten)), pack_ids + + +def pack_sum_xgbmatrix(xs, ys, gids=None, weights=None): + """Convert (feature, label) pairs into a xgb matrix with pack-sum format + Parameters + ---------- + xs: np.ndarray + The feature vector + ys: np.ndarray + The normalized throughput + gids: Optional[List[int]] + Group id (task id) + weights: Optional[np.ndarray] + The weight of samples + Returns + ------- + dmatrix: xgb.DMatrix + The DMatrix with pack-sum information + """ + if gids is not None: + # sort by group + indices = gids.argsort() + xs, ys = xs[indices], ys[indices] + group_sizes = np.bincount(gids) + if weights is not None: + weights = weights[indices] + else: + # assume it has only one group + group_sizes = [len(xs)] + + x_flatten = [] + y_flatten = [] + weights_flatten = [] + pack_ids = [] + 
+ if weights is not None: + for ct, (x, y, w) in enumerate(zip(xs, ys, weights)): + for row in x: + x_flatten.append(row) + y_flatten.append(y) + weights_flatten.append(w) + pack_ids.append(ct) + else: + for ct, (x, y) in enumerate(zip(xs, ys)): + for row in x: + x_flatten.append(row) + y_flatten.append(y) + pack_ids.append(ct) + + ret = xgb.DMatrix(np.array(x_flatten), y_flatten) + if weights is not None: + ret.set_weight(weights_flatten) + dmatrix_context.set("pack_ids", ret, np.array(pack_ids)) + dmatrix_context.set("group_sizes", ret, group_sizes) + return ret + + +def predict_throughput_pack_sum(raw_preds, pack_ids): + """Predict the throughputs for predictions in pack-sum format + Parameters + ---------- + raw_preds: np.ndarray + The raw predictions + pack_ids: List[int] + The pack id for predictions + Returns + ------- + throughputs: np.ndarray + The throughput + """ + sum_pred = np.bincount(pack_ids, weights=raw_preds) + return sum_pred + + +def pack_sum_square_error(preds, dtrain): + """Implement square error loss on pack-sum format as + a custom objective function for xgboost. 
+ Parameters + ---------- + preds: np.ndarray + The predictions + dtrain: xgb.DMatrix + The training set + Returns + ------- + gradient: np.ndarray + hessian: np.ndarray + gradient and hessian according to the xgboost format + """ + pack_ids = dmatrix_context.get("pack_ids", dtrain) + weight = dtrain.get_weight() + + sum_pred = np.bincount(pack_ids, weights=preds) + x = sum_pred[pack_ids] + y = dtrain.get_label() + gradient = x - y + hessian = np.ones_like(gradient) + + if len(weight) == 0: + return gradient, hessian + + return gradient * weight, hessian * weight + + +def pack_sum_rmse(raw_preds, labels): + """Evaluate RMSE (root mean square error) in the pack-sum format + Parameters + ---------- + raw_preds: np.ndarray + The raw prediction + labels: xgb.DMatrix + The ground-truth label matrix + Returns + ------- + name: str + score: float + The name and score of this metric + """ + pack_ids = dmatrix_context.get("pack_ids", labels) + preds = predict_throughput_pack_sum(raw_preds, pack_ids)[pack_ids] + return "p-rmse", np.sqrt(np.mean(np.square((preds - labels.get_label())))) + + +def pack_sum_average_peak_score(N): + """Return the evaluation function for average-peak-score@N + Parameters + ---------- + N: int + The "N" in "average-peak-score@N" + Returns + ------- + The evaluation function + """ + + def feval(preds, labels): + """Evaluate average-peak-score@N in the pack-sum format + Parameters + ---------- + preds: np.ndarray + The raw prediction + labels: xgb.DMatrix + The ground-truth label matrix + Returns + ------- + name: str + score: float + The name and score of this metric + """ + group_sizes = dmatrix_context.get("group_sizes", labels, [len(preds)]) + pack_ids = dmatrix_context.get("pack_ids", labels) + + preds = predict_throughput_pack_sum(preds, pack_ids) + labels = ( + np.bincount(pack_ids, weights=labels.get_label()) + / np.unique(pack_ids, return_counts=True)[1] + ) + + scores = [] + offset = 0 + for size in group_sizes: + preds_group = 
preds[offset : offset + size] + labels_group = labels[offset : offset + size] + offset += size + + trials = np.argsort(preds_group)[::-1][:N] + trial_scores = labels_group[trials] + curve = max_curve(trial_scores) / np.max(labels_group) + scores.append(np.mean(curve)) + return f"a-peak@{N}", np.mean(scores) + + return feval + + +class XGBoostCallback(TrainingCallback): + """Base class for XGBoost callbacks.""" + + def __call__(self, env: "xgb.core.CallbackEnv"): + # Compatibility with xgboost < 1.3 + return self.after_iteration(env.model, env.iteration, env.evaluation_result_list) + + def after_iteration(self, model: "xgb.Booster", epoch: int, evals_log: Dict): + raise NotImplementedError + + +class CustomCallback(XGBoostCallback): + """ + Callback function for xgboost. + Support custom evaluation function and early-stopping. + """ + + def __init__( + self, + stopping_rounds, + metric, + fevals, + evals=(), + log_file=None, + maximize=False, + verbose_eval=True, + skip_every=2, + ): + """Init function""" + self.stopping_rounds = stopping_rounds + self.metric = metric + self.metric_shortname = metric.split("-")[1] + self.fevals = fevals + self.evals = evals + self.log_file = log_file + self.maximize = maximize + self.verbose_eval = verbose_eval + self.skip_every = skip_every + self.state = {} + + def after_iteration(self, model: "xgb.Booster", epoch: int, evals_log: Dict): + """Run after each iteration. 
Return True when training should stop.""" + # pylint:disable = import-outside-toplevel + try: + from xgboost.callback import _fmt_metric # type: ignore + except ImportError: + # Compatibility with xgboost >= 1.6 + def _fmt_metric(value, show_stdv=True): + """format metric string""" + if len(value) == 2: + return f"{value[0]}:{value[1]:.5f}" + if len(value) == 3: + if show_stdv: + return f"{value[0]}:{value[1]:.5f}+{value[2]:.5f}" + return f"{value[0]}:{value[1]:.5f}" + raise ValueError("wrong metric value", value) + + ##### init state ##### + if not self.state: + self.state["maximize_score"] = self.maximize + self.state["best_iteration"] = 0 + if self.maximize: + self.state["best_score"] = float("-inf") + else: + self.state["best_score"] = float("inf") + + assert model is not None + if model.attr("best_score") is not None: + self.state["best_score"] = float(model.attr("best_score")) + self.state["best_iteration"] = int(model.attr("best_iteration")) + self.state["best_msg"] = model.attr("best_msg") + else: + model.set_attr(best_iteration=str(self.state["best_iteration"])) + model.set_attr(best_score=str(self.state["best_score"])) + res_dict = {} + + if epoch % self.skip_every == 1: + return False + + ##### evaluation ##### + for feval in self.fevals: + bst_eval = model.eval_set(self.evals, epoch, feval) + res = [x.split(":") for x in bst_eval.split()] + for kv in res[1:]: + res_dict[kv[0]] = [float(kv[1])] + + eval_res = [] + keys = list(res_dict.keys()) + keys.sort(key=lambda x: x if self.metric_shortname not in x else "a" + x) + for key in keys: + v = res_dict[key] + eval_res.append([key] + v) + + ##### print eval result ##### + if ( + not isinstance(self.verbose_eval, bool) + and self.verbose_eval + and epoch % self.verbose_eval == 0 + ): + infos = [f"XGB iter: {epoch:3d}"] + for item in eval_res: + if "null" in item[0]: + continue + infos.append(f"{item[0]}: {item[1]:.6f}") + + logger.debug("\t".join(infos)) + if self.log_file: + with open(self.log_file, "a") as 
fout: + fout.write("\t".join(infos) + "\n") + + ##### choose score and do early stopping ##### + score = None + for item in eval_res: + if item[0] == self.metric: + score = item[1] + break + assert score is not None + + best_score = self.state["best_score"] + best_iteration = self.state["best_iteration"] + maximize_score = self.state["maximize_score"] + + if (maximize_score and score > best_score) or (not maximize_score and score < best_score): + msg = f"[{epoch}] " + "\t".join([_fmt_metric(x) for x in eval_res]) + self.state["best_msg"] = msg + self.state["best_score"] = score + self.state["best_iteration"] = epoch + # save the property to attributes, so they will occur in checkpoint. + if model is not None: + model.set_attr( + best_score=str(self.state["best_score"]), + best_iteration=str(self.state["best_iteration"]), + best_msg=self.state["best_msg"], + ) + elif epoch - best_iteration >= self.stopping_rounds: + best_msg = self.state["best_msg"] + if self.verbose_eval: + logger.debug("XGB stopped. 
Best iteration: %s ", best_msg) + return True + + return False From a7ecd8f394d82b4bf735732e9d593691c59e2671 Mon Sep 17 00:00:00 2001 From: Dwijen Chawra Date: Tue, 3 Dec 2024 20:24:16 -0500 Subject: [PATCH 3/8] 1 more namespace addition --- python/tvm/auto_scheduler/cost_model/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/tvm/auto_scheduler/cost_model/__init__.py b/python/tvm/auto_scheduler/cost_model/__init__.py index 56e4a5f9128b..3fc767e43860 100644 --- a/python/tvm/auto_scheduler/cost_model/__init__.py +++ b/python/tvm/auto_scheduler/cost_model/__init__.py @@ -19,3 +19,4 @@ from .cost_model import RandomModel from .xgb_model import XGBModel +from .gnn_model import GNNModel From 4600a21037b4a39e5c4c2025c2b40517b92ef759 Mon Sep 17 00:00:00 2001 From: Dwijen Chawra Date: Tue, 3 Dec 2024 23:31:36 -0500 Subject: [PATCH 4/8] extracting ast into graph working --- .../auto_scheduler/cost_model/gnn_model.py | 125 ++++++++++++++---- 1 file changed, 98 insertions(+), 27 deletions(-) diff --git a/python/tvm/auto_scheduler/cost_model/gnn_model.py b/python/tvm/auto_scheduler/cost_model/gnn_model.py index 97b54d429d88..e87a01dea8d1 100644 --- a/python/tvm/auto_scheduler/cost_model/gnn_model.py +++ b/python/tvm/auto_scheduler/cost_model/gnn_model.py @@ -33,7 +33,10 @@ import tvm import networkx as nx import matplotlib.pyplot as plt -from tvm.relay import ExprVisitor +from ...relay.expr_functor import ExprVisitor +import uuid +from ...tir import * +from pyvis.network import Network try: @@ -48,18 +51,40 @@ class TrainingCallback: # type: ignore logger = logging.getLogger("auto_scheduler") -class GraphBuilder(ExprVisitor): - def __init__(self): - super().__init__() - self.graph = nx.DiGraph() # Create a directed graph - self.current_node = None # To keep track of the current node being visited +def extract_attr_stmt_features(node: AttrStmt) -> List[float]: + # Extract features from AttrStmt + return [len(node.attr_key), len(node.value), len(node.body)] + 
+def extract_int_imm_features(node: IntImm) -> List[float]: + # Extract features from IntImm + return [node.value] + +def extract_allocate_features(node: Allocate) -> List[float]: + # Extract features from Allocate + return [len(node.buffer_var), len(node.dtype), len(node.extents)] + +def extract_seq_stmt_features(node: SeqStmt) -> List[float]: + # Extract features from SeqStmt + return [len(node.seq)] + +def extract_for_features(node: For) -> List[float]: + # Extract features from For + return [len(node.loop_var), node.min.value, node.extent.value, node.kind] + +def extract_buffer_store_features(node: BufferStore) -> List[float]: + # Extract features from BufferStore + return [len(node.buffer), len(node.indices)] + +def extract_float_imm_features(node: FloatImm) -> List[float]: + # Extract features from FloatImm + return [node.value] + +def extract_call_features(node: Call) -> List[float]: + # Extract features from Call + return [len(node.args), len(node.dtype)] - def visit(self, expr): - # to start just print the type of the expression - print(type(expr)) - super().visit(expr) -class GNNCostModel(PythonBasedModel): +class GNNModel(PythonBasedModel): """Train a GNN model that learns from the AST representation of a TIR program and predicts the performance of the program. 
@@ -201,20 +226,49 @@ def predict(self, task, states): print(type(task)) task: SearchTask - # we will convert the AST into a networkx graph graph = nx.DiGraph() - - + parent_stack = [] + types = [] + + def preorder(node): + current_node_id = graph.number_of_nodes() + current_node_content = str(node) + # Add the current node to the graph if it's not already present + if node not in graph: + if type(node) not in types: + types.append(type(node)) + graph.add_node(str(current_node_id), title=current_node_content) + # If there's a parent, add an edge from the parent to the current node + if parent_stack: + parent = parent_stack[-1] + graph.add_edge(str(current_node_id), str(parent)) + # Push the current node onto the stack + parent_stack.append(str(current_node_id)) + + # Return None to continue recursion + return None - ## node visiting using tvm.tir.stmt_functor.post_order_visit - def visit_node(op): - """Split can vectorize the loops found in `find_width8`.""" + def postorder(node): + # Pop the current node off the stack after processing + if parent_stack: + parent_stack.pop() + + # Return None to continue postorder processing return None @tvm.tir.transform.prim_func_pass(opt_level=0) def ast_extractor(f, mod, ctx): - tvm.tir.stmt_functor.post_order_visit(f.body, visit_node) + # clear the graph + graph.clear() + # clear the parent stack + parent_stack.clear() + + # add in root node to graph and parent stack + graph.add_node("root") + parent_stack.append("root") + + tvm.tir.stmt_functor.ir_transform(f.body, preorder, postorder) return f for state in states: @@ -222,15 +276,32 @@ def ast_extractor(f, mod, ctx): schedule, args = task.compute_dag.apply_steps_from_state(state) schedule: te.Schedule - # with tvm.transform.PassContext(config={"tir.add_lower_pass": [(3, ast_extractor)]}): - mod: tvm.ir.module.IRModule = tvm.lower(schedule, args) - - ## node visiting using ExprVisitor - visitor = GraphBuilder() - visitor.visit(mod["main"]) - - print('exiting') - exit() + with 
tvm.transform.PassContext(config={"tir.add_lower_pass": [(3, ast_extractor)]}): + mod: tvm.ir.module.IRModule = tvm.lower(schedule, args) + + # print the graph + print(graph) + print(types) + print("LEN TYPES:", len(types)) + # visualize the graph + # nx.draw(graph, with_labels=True) + nt = Network('100%', '100%', directed=True, notebook=False) + nt.show_buttons(filter_=['physics']) + + nt.options.physics.use_repulsion = True + + nt.from_nx(graph) + nt.show("graph" + str(uuid.uuid4()) + ".html") + + # plt.savefig("graph" + str(uuid.uuid4()) + ".png") + # clear the graph + graph.clear() + # clear the parent stack + parent_stack.clear() + types.clear() + + print('exiting') + exit(-1) features = get_per_store_features_from_states(states, task) From 1ab4822a324bb12e6020c29214b5c85b67daf488 Mon Sep 17 00:00:00 2001 From: Dwijen Chawra Date: Wed, 4 Dec 2024 01:15:35 -0500 Subject: [PATCH 5/8] tvm objects not serializable :( --- .../auto_scheduler/cost_model/gnn_model.py | 228 +++++++++--------- 1 file changed, 111 insertions(+), 117 deletions(-) diff --git a/python/tvm/auto_scheduler/cost_model/gnn_model.py b/python/tvm/auto_scheduler/cost_model/gnn_model.py index e87a01dea8d1..b3b9b208868e 100644 --- a/python/tvm/auto_scheduler/cost_model/gnn_model.py +++ b/python/tvm/auto_scheduler/cost_model/gnn_model.py @@ -19,8 +19,10 @@ """Cost model based on xgboost""" import multiprocessing import logging -from typing import Dict +import multiprocessing.pool +from typing import Dict, List, Tuple from collections import defaultdict +import time import numpy as np @@ -34,10 +36,13 @@ import networkx as nx import matplotlib.pyplot as plt from ...relay.expr_functor import ExprVisitor +from ..search_task import SearchTask +from ..loop_state import State import uuid from ...tir import * from pyvis.network import Network - +from concurrent.futures import ProcessPoolExecutor +from pathos.multiprocessing import ProcessingPool try: from xgboost.callback import TrainingCallback # type: 
ignore @@ -51,38 +56,80 @@ class TrainingCallback: # type: ignore logger = logging.getLogger("auto_scheduler") -def extract_attr_stmt_features(node: AttrStmt) -> List[float]: - # Extract features from AttrStmt - return [len(node.attr_key), len(node.value), len(node.body)] - -def extract_int_imm_features(node: IntImm) -> List[float]: - # Extract features from IntImm - return [node.value] - -def extract_allocate_features(node: Allocate) -> List[float]: - # Extract features from Allocate - return [len(node.buffer_var), len(node.dtype), len(node.extents)] - -def extract_seq_stmt_features(node: SeqStmt) -> List[float]: - # Extract features from SeqStmt - return [len(node.seq)] - -def extract_for_features(node: For) -> List[float]: - # Extract features from For - return [len(node.loop_var), node.min.value, node.extent.value, node.kind] - -def extract_buffer_store_features(node: BufferStore) -> List[float]: - # Extract features from BufferStore - return [len(node.buffer), len(node.indices)] - -def extract_float_imm_features(node: FloatImm) -> List[float]: - # Extract features from FloatImm - return [node.value] - -def extract_call_features(node: Call) -> List[float]: - # Extract features from Call - return [len(node.args), len(node.dtype)] - +def vizgraph(graph: nx.DiGraph): + nt = Network('100%', '100%', directed=True, notebook=False) + nt.show_buttons(filter_=['physics']) + nt.options.physics.use_repulsion = True + nt.from_nx(graph) + nt.show("graph" + str(uuid.uuid4()) + ".html") + plt.savefig("graph" + str(uuid.uuid4()) + ".png") + +def node2vec(): + pass + +def gnn_feature_extractor_tup(task_state: Tuple): + gnn_feature_extractor(task_state[0], task_state[1]) + +def gnn_feature_extractor(task: SearchTask, state: State): + graph = nx.DiGraph() + parent_stack = [] + types = [] + + def preorder(node): + current_node_id = graph.number_of_nodes() + current_node_content = str(node) + # Add the current node to the graph if it's not already present + if node not in graph: + 
if type(node) not in types: + types.append(type(node)) + graph.add_node(str(current_node_id), title=current_node_content) + # If there's a parent, add an edge from the parent to the current node + if parent_stack: + parent = parent_stack[-1] + graph.add_edge(str(current_node_id), str(parent)) + # Push the current node onto the stack + parent_stack.append(str(current_node_id)) + + # Return None to continue recursion + return None + + def postorder(node): + # Pop the current node off the stack after processing + if parent_stack: + parent_stack.pop() + + # Return None to continue postorder processing + return None + + @tvm.tir.transform.prim_func_pass(opt_level=3) + def ast_extractor(f, mod, ctx): + # clear the graph + graph.clear() + # clear the parent stack + parent_stack.clear() + + # add in root node to graph and parent stack + graph.add_node("root") + parent_stack.append("root") + + tvm.tir.stmt_functor.ir_transform(f.body, preorder, postorder) + return f + + # apply the state transformations + schedule, args = task.compute_dag.apply_steps_from_state(state) + schedule: te.Schedule + + with tvm.transform.PassContext(config={"tir.add_lower_pass": [(3, ast_extractor)]}): + mod: tvm.ir.module.IRModule = tvm.lower(schedule, args) + +def get_gnn_features(task: SearchTask, states: List[State]): + # parallel process all the states + args = list(zip([task]*len(states), states)) + + with ProcessingPool() as pool: + features = list(pool.map(gnn_feature_extractor_tup, args)) + + return features class GNNModel(PythonBasedModel): """Train a GNN model that learns from the AST representation of a TIR program @@ -135,6 +182,14 @@ def __init__( self.verbose_eval = verbose_eval self.model_file = model_file self.adaptive_training = adaptive_training + + self.predictcounter = 0 + self.predictstagecounter = 0 + self.updatecounter = 0 + + self.average_pred_time = 0 + self.average_pred_count = 0 + super().__init__() @@ -145,12 +200,18 @@ def __init__( self.inputs_feature_cache = [] def 
update(self, inputs, results): + self.updatecounter += len(inputs) + print("Update ", self.updatecounter) + """Update the cost model according to new measurement results (training data). XGBoost does not support incremental training, so we re-train a new model every time. Parameters ---------- inputs : List[MeasureInput] The measurement inputs + **** this containts + - task + - state results : List[MeasureResult] The measurement results """ @@ -207,6 +268,9 @@ def update(self, inputs, results): self.save(self.model_file) def predict(self, task, states): + self.predictcounter += len(states) + print("Predict ", self.predictcounter) + """Predict the scores of states Parameters ---------- @@ -220,91 +284,17 @@ def predict(self, task, states): The predicted scores for all states """ - # print tasks and states - print("XGBModel: predict") - print("task") - print(type(task)) - task: SearchTask - - # we will convert the AST into a networkx graph - graph = nx.DiGraph() - parent_stack = [] - types = [] - - def preorder(node): - current_node_id = graph.number_of_nodes() - current_node_content = str(node) - # Add the current node to the graph if it's not already present - if node not in graph: - if type(node) not in types: - types.append(type(node)) - graph.add_node(str(current_node_id), title=current_node_content) - # If there's a parent, add an edge from the parent to the current node - if parent_stack: - parent = parent_stack[-1] - graph.add_edge(str(current_node_id), str(parent)) - # Push the current node onto the stack - parent_stack.append(str(current_node_id)) - - # Return None to continue recursion - return None - - def postorder(node): - # Pop the current node off the stack after processing - if parent_stack: - parent_stack.pop() - - # Return None to continue postorder processing - return None - - @tvm.tir.transform.prim_func_pass(opt_level=0) - def ast_extractor(f, mod, ctx): - # clear the graph - graph.clear() - # clear the parent stack - parent_stack.clear() - - # 
add in root node to graph and parent stack - graph.add_node("root") - parent_stack.append("root") - - tvm.tir.stmt_functor.ir_transform(f.body, preorder, postorder) - return f - - for state in states: - # apply the state transformations - schedule, args = task.compute_dag.apply_steps_from_state(state) - schedule: te.Schedule - - with tvm.transform.PassContext(config={"tir.add_lower_pass": [(3, ast_extractor)]}): - mod: tvm.ir.module.IRModule = tvm.lower(schedule, args) - - # print the graph - print(graph) - print(types) - print("LEN TYPES:", len(types)) - # visualize the graph - # nx.draw(graph, with_labels=True) - nt = Network('100%', '100%', directed=True, notebook=False) - nt.show_buttons(filter_=['physics']) - - nt.options.physics.use_repulsion = True - - nt.from_nx(graph) - nt.show("graph" + str(uuid.uuid4()) + ".html") - - # plt.savefig("graph" + str(uuid.uuid4()) + ".png") - # clear the graph - graph.clear() - # clear the parent stack - parent_stack.clear() - types.clear() - - print('exiting') - exit(-1) - + # Timing the first function call + start_time_gnn = time.time() + features = get_gnn_features(task, states) + end_time_gnn = time.time() + print(f"Time taken for get_gnn_features: {end_time_gnn - start_time_gnn:.6f} seconds") + # Timing the second function call + start_time_per_store = time.time() features = get_per_store_features_from_states(states, task) + end_time_per_store = time.time() + print(f"Time taken for get_per_store_features_from_states: {end_time_per_store - start_time_per_store:.6f} seconds") if self.bst is not None and len(self.inputs) > self.num_warmup_sample: dtest, pack_ids = feature_to_pack_sum_xgbmatrix(features) raw_preds = self.bst.predict(dtest) @@ -320,6 +310,10 @@ def ast_extractor(f, mod, ctx): return ret def predict_stages(self, task, states): + self.predictstagecounter += len(states) + print("Predict stage ", self.predictstagecounter) + + """Predict the scores of all stages in states. 
This is the breakdown version of `predict`. Parameters From 6370f18393cfe05cd517ff07ef4db44c6d40bb93 Mon Sep 17 00:00:00 2001 From: Dwijen Chawra Date: Wed, 4 Dec 2024 02:36:09 -0500 Subject: [PATCH 6/8] parallel lowering and ast generation in python --- .../auto_scheduler/cost_model/gnn_model.py | 26 +++++++++++++------ 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/python/tvm/auto_scheduler/cost_model/gnn_model.py b/python/tvm/auto_scheduler/cost_model/gnn_model.py index b3b9b208868e..cb256d3f0f5f 100644 --- a/python/tvm/auto_scheduler/cost_model/gnn_model.py +++ b/python/tvm/auto_scheduler/cost_model/gnn_model.py @@ -19,13 +19,16 @@ """Cost model based on xgboost""" import multiprocessing import logging +import multiprocessing.context import multiprocessing.pool from typing import Dict, List, Tuple from collections import defaultdict import time +import json import numpy as np +import tvm.auto_scheduler from tvm.autotvm.tuner.metric import max_curve from .cost_model import PythonBasedModel from ..feature import get_per_store_features_from_measure_pairs, get_per_store_features_from_states @@ -39,10 +42,9 @@ from ..search_task import SearchTask from ..loop_state import State import uuid -from ...tir import * + +from ..measure import MeasureInput from pyvis.network import Network -from concurrent.futures import ProcessPoolExecutor -from pathos.multiprocessing import ProcessingPool try: from xgboost.callback import TrainingCallback # type: ignore @@ -67,8 +69,10 @@ def vizgraph(graph: nx.DiGraph): def node2vec(): pass -def gnn_feature_extractor_tup(task_state: Tuple): - gnn_feature_extractor(task_state[0], task_state[1]) +def gnn_feature_extractor_tup(ser): + minput: MeasureInput = MeasureInput.deserialize(ser) + + gnn_feature_extractor(minput.task, minput.state) def gnn_feature_extractor(task: SearchTask, state: State): graph = nx.DiGraph() @@ -124,10 +128,16 @@ def ast_extractor(f, mod, ctx): def get_gnn_features(task: SearchTask, states: 
List[State]): # parallel process all the states - args = list(zip([task]*len(states), states)) + args = list(zip([(task)]*len(states), states)) + + # make measureinputs + inputs = [MeasureInput(task, s).serialize() for s in states] - with ProcessingPool() as pool: - features = list(pool.map(gnn_feature_extractor_tup, args)) + + # ctx = multiprocessing.get_context('fork') + + with multiprocessing.pool.Pool(multiprocessing.cpu_count()) as executor: + features = list(executor.map(gnn_feature_extractor_tup, inputs)) return features From 8214ebf6bdaf7f31019cf8c7ecb061d2fc720569 Mon Sep 17 00:00:00 2001 From: Dwijen Chawra Date: Wed, 4 Dec 2024 02:41:48 -0500 Subject: [PATCH 7/8] remove unused files --- conv2dout_lowered.txt | 50145 ---------------------------------------- llpass.py | 162 - lowered_tir.py | 481 - 3 files changed, 50788 deletions(-) delete mode 100644 conv2dout_lowered.txt delete mode 100644 llpass.py delete mode 100644 lowered_tir.py diff --git a/conv2dout_lowered.txt b/conv2dout_lowered.txt deleted file mode 100644 index f2a1c453f8c5..000000000000 --- a/conv2dout_lowered.txt +++ /dev/null @@ -1,50145 +0,0 @@ ----------------------------------------------------------------------- ------------------------------- [ Search ] ----------------------------------------------------------------------- -Generate Sketches #s: 1 -Sample Initial Population #s: 66 fail_ct: 1982 Time elapsed: 2.01 -GA Iter: 0 Max score: 0.9867 Min score: 0.0250 #Pop: 66 #M+: 0 #M-: 0 -GA Iter: 4 Max score: 0.9997 Min score: 0.9788 #Pop: 128 #M+: 1390 #M-: 0 -EvolutionarySearch #s: 128 Time elapsed: 30.83 ----------------------------------------------------------------------- ------------------------------- [ Measure ] ----------------------------------------------------------------------- -Get 10 programs to measure: -..........**********================================================== -No: 1 GFLOPS: 40.51 / 40.51 results: MeasureResult(cost:[0.0057], error_no:0, all_cost:1.67, 
Tstamp:1732601215.31) -================================================= -Placeholder: data, kernel, bias -blockIdx.x i0.0@i1.0@i2.0@i3.0@ (0,8) - vthread i0.1@i1.1@i2.1@i3.1@ (0,2) - threadIdx.x i0.2@i1.2@i2.2@i3.2@ (0,16) - for rc.0 (0,32) - for ry.0 (0,3) - for ax0@ax1@ax2@ax3@.0.0 (0,192) - threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,16) - kernel.shared = ... - for ax0@ax1@ax2@ax3@.0.0 (0,3) - threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,16) - vectorize ax0@ax1@ax2@ax3@.1 (0,24) - pad_temp.shared = ... - for rc.1 (0,16) - for yy.3 (0,7) - for xx.3 (0,7) - for rx.2 (0,3) - for ff.4 (0,2) - conv2d_nchw = ... - for i1.3 (0,2) - for i2.3 (0,7) - for i3.3 (0,7) - compute = ... - -================================================== -No: 2 GFLOPS: 356.02 / 356.02 results: MeasureResult(cost:[0.0006], error_no:0, all_cost:0.95, Tstamp:1732601215.87) -================================================== -Placeholder: data, kernel, bias -blockIdx.x i0.0@i1.0@i2.0@i3.0@ (0,4) - vthread i0.1@i1.1@i2.1@i3.1@ (0,7) - threadIdx.x i0.2@i1.2@i2.2@i3.2@ (0,128) - for rc.0 (0,512) - for ax0@ax1@ax2@ax3@.0.0 (0,3) - threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,128) - vectorize ax0@ax1@ax2@ax3@.1 (0,3) - kernel.shared = ... - threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,128) - pad_temp.shared = ... - for ry.2 (0,3) - for rx.2 (0,3) - for xx.4 (0,7) - conv2d_nchw = ... - for i3.3 (0,7) - compute = ... - -================================================== -No: 3 GFLOPS: 505.45 / 505.45 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:11.00, Tstamp:1732601226.47) -================================================== -Placeholder: data, kernel, bias -blockIdx.x i0.0@i1.0@i2.0@i3.0@ (0,16) - threadIdx.x i0.2@i1.2@i2.2@i3.2@ (0,392) - conv2d_nchw auto_unroll: 16 - for rc.0 (0,16) - for ax0@ax1@ax2@ax3@.0.0 (0,24) - threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,392) - kernel.shared = ... - for ax0@ax1@ax2@ax3@.0.0 (0,7) - threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,392) - pad_temp.shared = ... 
- for rc.1 (0,32) - for rx.1 (0,3) - for ff.3 (0,4) - for ry.2 (0,3) - conv2d_nchw = ... - for i1.3 (0,4) - compute = ... - -================================================== -No: 4 GFLOPS: 218.94 / 505.45 results: MeasureResult(cost:[0.0011], error_no:0, all_cost:1.55, Tstamp:1732601227.51) -================================================== -Placeholder: data, kernel, bias -blockIdx.x i0.0@i1.0@i2.0@i3.0@ (0,16) - vthread i0.1@i1.1@i2.1@i3.1@ (0,7) - threadIdx.x i0.2@i1.2@i2.2@i3.2@ (0,56) - conv2d_nchw auto_unroll: 64 - for rc.0 (0,32) - for ax0@ax1@ax2@ax3@.0.0 (0,83) - threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,56) - kernel.shared = ... - for ax0@ax1@ax2@ax3@.0.0 (0,24) - threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,56) - pad_temp.shared = ... - for rx.1 (0,3) - for ff.3 (0,4) - for rc.2 (0,16) - for ry.2 (0,3) - conv2d_nchw = ... - for i1.3 (0,4) - compute = ... - -================================================== -No: 5 GFLOPS: 259.70 / 505.45 results: MeasureResult(cost:[0.0009], error_no:0, all_cost:0.95, Tstamp:1732601228.07) -================================================== -Placeholder: data, kernel, bias -blockIdx.x i0.0@i1.0@i2.0@i3.0@ (0,8) - vthread i0.1@i1.1@i2.1@i3.1@ (0,7) - threadIdx.x i0.2@i1.2@i2.2@i3.2@ (0,112) - for rc.0 (0,32) - for ry.0 (0,3) - for ax0@ax1@ax2@ax3@.0.0 (0,28) - threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,112) - kernel.shared = ... - for ax0@ax1@ax2@ax3@.0.0 (0,9) - threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,112) - pad_temp.shared = ... - for rc.1 (0,16) - for rx.2 (0,3) - for ff.4 (0,4) - conv2d_nchw = ... - for i1.3 (0,4) - compute = ... 
- -================================================== -No: 6 GFLOPS: 349.91 / 505.45 results: MeasureResult(cost:[0.0007], error_no:0, all_cost:0.93, Tstamp:1732601228.72) -================================================== -Placeholder: data, kernel, bias -blockIdx.x i0.0@i1.0@i2.0@i3.0@ (0,8) - threadIdx.x i0.2@i1.2@i2.2@i3.2@ (0,448) - conv2d_nchw auto_unroll: 64 - for rc.0 (0,512) - threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,448) - vectorize ax0@ax1@ax2@ax3@.1 (0,3) - kernel.shared = ... - threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,448) - pad_temp.shared = ... - for ry.2 (0,3) - for rx.2 (0,3) - for yy.4 (0,7) - conv2d_nchw = ... - for i2.3 (0,7) - compute = ... - -================================================== -No: 7 GFLOPS: 189.28 / 505.45 results: MeasureResult(cost:[0.0012], error_no:0, all_cost:11.13, Tstamp:1732601239.33) -================================================== -Placeholder: data, kernel, bias -blockIdx.x i0.0@i1.0@i2.0@i3.0@ (0,4) - vthread i0.1@i1.1@i2.1@i3.1@ (0,7) - threadIdx.x i0.2@i1.2@i2.2@i3.2@ (0,64) - conv2d_nchw auto_unroll: 64 - for rc.0 (0,64) - for ax0@ax1@ax2@ax3@.0.0 (0,144) - threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,64) - kernel.shared = ... - for ax0@ax1@ax2@ax3@.0.0 (0,11) - threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,64) - pad_temp.shared = ... - for rc.1 (0,8) - for ff.3 (0,2) - for ry.2 (0,3) - for rx.2 (0,3) - for yy.4 (0,7) - conv2d_nchw = ... - for i1.3 (0,2) - for i2.3 (0,7) - compute = ... - -================================================== -No: 8 GFLOPS: 211.81 / 505.45 results: MeasureResult(cost:[0.0011], error_no:0, all_cost:10.46, Tstamp:1732601249.41) -================================================== -Placeholder: data, kernel, bias -blockIdx.x i0.0@i1.0@i2.0@i3.0@ (0,14) - threadIdx.x i0.2@i1.2@i2.2@i3.2@ (0,128) - for rc.0 (0,256) - for ry.0 (0,3) - for rx.0 (0,3) - for ax0@ax1@ax2@ax3@.0.0 (0,4) - threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,128) - kernel.shared = ... - threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,128) - pad_temp.shared = ... 
- for rc.1 (0,2) - for ff.4 (0,2) - for yy.4 (0,7) - conv2d_nchw = ... - for i1.3 (0,2) - for i2.3 (0,7) - compute = ... - -================================================== -No: 9 GFLOPS: 106.17 / 505.45 results: MeasureResult(cost:[0.0022], error_no:0, all_cost:0.95, Tstamp:1732601249.98) -================================================== -Placeholder: data, kernel, bias -blockIdx.x i0.0@i1.0@i2.0@i3.0@ (0,28) - vthread i0.1@i1.1@i2.1@i3.1@ (0,4) - threadIdx.x i0.2@i1.2@i2.2@i3.2@ (0,56) - conv2d_nchw auto_unroll: 16 - for rc.0 (0,256) - for ax0@ax1@ax2@ax3@.0.0 (0,42) - threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,56) - kernel.shared = ... - threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,56) - pad_temp.shared = ... - for ry.1 (0,3) - for rx.1 (0,3) - for ff.3 (0,2) - for rc.2 (0,2) - for ff.4 (0,2) - conv2d_nchw = ... - for i1.3 (0,4) - compute = ... - -================================================== -No: 10 GFLOPS: 315.68 / 505.45 results: MeasureResult(cost:[0.0007], error_no:0, all_cost:1.12, Tstamp:1732601250.54) -================================================== -Placeholder: data, kernel, bias -blockIdx.x i0.0@i1.0@i2.0@i3.0@ (0,14) - threadIdx.x i0.2@i1.2@i2.2@i3.2@ (0,128) - conv2d_nchw auto_unroll: 1024 - for rc.0 (0,128) - for ry.0 (0,3) - for rx.0 (0,3) - threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,128) - vectorize ax0@ax1@ax2@ax3@.1 (0,9) - kernel.shared = ... - threadIdx.x ax0@ax1@ax2@ax3@.0.1 (0,128) - pad_temp.shared = ... - for rc.2 (0,4) - for ff.4 (0,2) - for yy.4 (0,7) - conv2d_nchw = ... - for i1.3 (0,2) - for i2.3 (0,7) - compute = ... 
- -Time elapsed for measurement: 38.33 s ----------------------------------------------------------------------- ------------------------------- [ Done ] ----------------------------------------------------------------------- -Computational DAG: -data = PLACEHOLDER [1, 512, 7, 7] -pad_temp(i0, i1, i2, i3) = tir.if_then_else(((((i2 >= 1) && (i2 < 8)) && (i3 >= 1)) && (i3 < 8)), data[i0, i1, (i2 - 1), (i3 - 1)], 0f) -kernel = PLACEHOLDER [512, 512, 3, 3] -conv2d_nchw(nn, ff, yy, xx) += (pad_temp[nn, rc, (yy + ry), (xx + rx)]*kernel[ff, rc, ry, rx]) -bias = PLACEHOLDER [1, 512, 1, 1] -T_add(ax0, ax1, ax2, ax3) = (conv2d_nchw[ax0, ax1, ax2, ax3] + bias[ax0, ax1, 0, 0]) -compute(i0, i1, i2, i3) = max(T_add[i0, i1, i2, i3], 0f) - -Get devices for measurement successfully! - -Phase 0 --------------------- -0 - -1 - -0 - -512 - -0 - -7 - -0 - -7 - -T.bool(True) - -64 - -1 - -49 - -0 - -1 - -blockIdx_x - -8 - -blockIdx_x * 8 - -8 - -threadIdx_x - -7 - -threadIdx_x // 7 - -1 - -7 - -threadIdx_x % 7 - -1 - -T.bool(True) - -1024 - -1 - -0 - -2 - -0 - -4 - -T.float32(0.0) - -0 - -ff_inner_init - -ff_outer_inner_init - -ff_outer_inner_init * 4 - -ff_inner_init + ff_outer_inner_init * 4 - -ff_inner_init + ff_outer_inner_init * 4 + blockIdx_x * 8 - -conv2d_nchw = T.Buffer((1, 512, 7, 7)) -ff_inner_init = T.int32() -ff_outer_inner_init = T.int32() -blockIdx_x = T.int32() -threadIdx_x = T.int32() -conv2d_nchw[0, ff_inner_init + ff_outer_inner_init * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = T.float32(0.0) - -for ff_inner_init in range(4): - conv2d_nchw = T.Buffer((1, 512, 7, 7)) - ff_outer_inner_init = T.int32() - blockIdx_x = T.int32() - threadIdx_x = T.int32() - conv2d_nchw[0, ff_inner_init + ff_outer_inner_init * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = T.float32(0.0) - -for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): - conv2d_nchw = T.Buffer((1, 512, 7, 7)) - blockIdx_x = T.int32() - threadIdx_x = T.int32() - conv2d_nchw[0, ff_inner_init + 
ff_outer_inner_init * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = T.float32(0.0) - -0 - -8 - -0 - -3 - -0 - -1 - -rc_outer_outer - -64 - -rc_outer_outer * 64 - -64 - -0 - -9 - -rx_outer_outer - -7 - -T.bool(True) - -0 - -83 - -49 - -threadIdx_x - -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer - -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 - -threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 - -(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 - -(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 - -(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 - -(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1 - -T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1) - -(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 - -(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - -(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 - -(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64 - -T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64) - -(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 - -(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - -(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer - -(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7 - -T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7) - -64 - -(threadIdx_x + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64 - -T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64) - -576 - -(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576 - -T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576) - -4032 - -threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032 - -T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032) - -4032 - -threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032 - -T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032) - -(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - -1 - -(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 - -8 - -(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 - -(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 - -1 - -(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 - -(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 - -8 - -(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8 - -(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8 - -1 - -(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 
49) // 7 % 9 - 1 - -1 - -(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1 - -data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1] - -T.float32(0.0) - -T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) - -pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") -threadIdx_x = T.int32() -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((1, 512, 7, 7)) -rc_outer_outer = T.int32() -pad_temp_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) - -threadIdx_x = T.int32() -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() -if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((1, 512, 7, 7)) - rc_outer_outer = T.int32() - pad_temp_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, 
(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) - -threadIdx_x = T.int32() -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() -if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((1, 512, 7, 7)) - rc_outer_outer = T.int32() - pad_temp_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) - -threadIdx_x = T.int32() -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() -if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer 
* 49 < 4032): - pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((1, 512, 7, 7)) - rc_outer_outer = T.int32() - pad_temp_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) - -threadIdx_x = T.int32() -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() -if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((1, 512, 7, 7)) - rc_outer_outer = T.int32() - pad_temp_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 
9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) - -threadIdx_x = T.int32() -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() -rx_outer_outer = T.int32() -if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") - data = T.Buffer((1, 512, 7, 7)) - rc_outer_outer = T.int32() - pad_temp_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) - -threadIdx_x = T.int32() -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() -rc_outer_outer = T.int32() -if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): - rx_outer_outer = T.int32() - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") - data = T.Buffer((1, 512, 7, 7)) - pad_temp_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, 
(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) - -threadIdx_x = T.int32() -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() -if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1): - rc_outer_outer = T.int32() - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): - rx_outer_outer = T.int32() - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") - data = T.Buffer((1, 512, 7, 7)) - pad_temp_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 
// 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1): - rc_outer_outer = T.int32() - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): - rx_outer_outer = T.int32() - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - pad_temp_shared = T.Buffer((1, 
512, 9, 9), scope="shared") - data = T.Buffer((1, 512, 7, 7)) - pad_temp_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) - -for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1): - rc_outer_outer = T.int32() - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): - rx_outer_outer = T.int32() - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): - if T.likely(threadIdx_x + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") - data = T.Buffer((1, 512, 7, 7)) - pad_temp_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) - -8 - -blockIdx_x * 8 - -8 - -64 - -rc_outer_outer * 64 - -64 - -0 - -3 - -1 - -T.bool(True) - -0 - -32 - -49 - -threadIdx_x - -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer - -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 - -threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 - -(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 - -(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 - -(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - -(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) 
// 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 - -(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8 - -T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8) - -(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 - -(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - -(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 - -(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64 - -T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64) - -rx_outer_outer - rx_outer_outer - -rx_outer_outer - rx_outer_outer < 1 - -T.likely(rx_outer_outer - rx_outer_outer < 1) - -512 - -(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512 - -T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512) - -1536 - -threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536 - -T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536) - -1536 - -threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536 - -T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536) - -1536 - -threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536 - -T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536) - -512 - -(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512 - -T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512) - -(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3 - -kernel[(threadIdx_x + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] - -kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") -kernel = T.Buffer((512, 512, 3, 3)) -threadIdx_x = T.int32() -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() -blockIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] - -threadIdx_x = T.int32() -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() -blockIdx_x = T.int32() -if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): - kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") - kernel = T.Buffer((512, 512, 3, 3)) - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + 
rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] - -threadIdx_x = T.int32() -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() -if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - blockIdx_x = T.int32() - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): - kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") - kernel = T.Buffer((512, 512, 3, 3)) - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] - -threadIdx_x = T.int32() -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() -if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - blockIdx_x = T.int32() - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): - kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") - kernel = T.Buffer((512, 512, 3, 3)) - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] - -threadIdx_x = T.int32() -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() -if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - blockIdx_x = T.int32() - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): - kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") - kernel = T.Buffer((512, 512, 3, 3)) - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] - -threadIdx_x = T.int32() -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() -if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - blockIdx_x = T.int32() - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): - kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") - kernel = T.Buffer((512, 512, 3, 3)) - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] - -rx_outer_outer = T.int32() -if T.likely(rx_outer_outer - rx_outer_outer < 1): - threadIdx_x = T.int32() - ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - blockIdx_x = T.int32() - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): - kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") - kernel = T.Buffer((512, 512, 3, 3)) - rc_outer_outer = T.int32() - kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] - -threadIdx_x = T.int32() -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() -rc_outer_outer = T.int32() -if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): - rx_outer_outer = T.int32() - if T.likely(rx_outer_outer - rx_outer_outer < 1): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - blockIdx_x = T.int32() - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): - kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") - kernel = T.Buffer((512, 512, 3, 3)) - kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] - -threadIdx_x = T.int32() -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() -blockIdx_x = 
T.int32() -if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): - rc_outer_outer = T.int32() - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): - rx_outer_outer = T.int32() - if T.likely(rx_outer_outer - rx_outer_outer < 1): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): - kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") - kernel = T.Buffer((512, 512, 3, 3)) - kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() - blockIdx_x = T.int32() - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): - rc_outer_outer = T.int32() - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 
< 64): - rx_outer_outer = T.int32() - if T.likely(rx_outer_outer - rx_outer_outer < 1): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): - kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") - kernel = T.Buffer((512, 512, 3, 3)) - kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] - -for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - blockIdx_x = T.int32() - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): - rc_outer_outer = T.int32() - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): - rx_outer_outer = T.int32() - if T.likely(rx_outer_outer - rx_outer_outer < 1): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): - kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") - kernel = T.Buffer((512, 512, 3, 3)) - kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] - -0 - -8 - -0 - -3 - -0 - -0 - -8 - -0 - -0 - -ff_inner - -ff_outer_inner - -ff_outer_inner * 4 - -ff_inner + ff_outer_inner * 4 - -ff_inner + ff_outer_inner * 4 + blockIdx_x * 8 - -conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] - -rc_inner - -rc_outer_inner - -rc_outer_outer * 8 - -rc_outer_inner + rc_outer_outer * 8 - -(rc_outer_inner + rc_outer_outer * 8) * 8 - -rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8 - -ry_outer_inner - -threadIdx_x // 7 + ry_outer_inner - -threadIdx_x % 7 + rx_outer_outer - -pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] - -kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] - -pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + 
ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] - -conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] - -conv2d_nchw = T.Buffer((1, 512, 7, 7)) -ff_inner = T.int32() -ff_outer_inner = T.int32() -blockIdx_x = T.int32() -threadIdx_x = T.int32() -pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") -rc_inner = T.int32() -rc_outer_inner = T.int32() -rc_outer_outer = T.int32() -ry_outer_inner = T.int32() -rx_outer_outer = T.int32() -kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") -conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] - -for ff_inner in range(4): - conv2d_nchw = T.Buffer((1, 512, 7, 7)) - ff_outer_inner = T.int32() - blockIdx_x = T.int32() - threadIdx_x = T.int32() - pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") - rc_inner = T.int32() - rc_outer_inner = T.int32() - rc_outer_outer = T.int32() - ry_outer_inner = T.int32() - rx_outer_outer = T.int32() - kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") - conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, 
threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] - -for rc_inner, ff_inner in T.grid(8, 4): - conv2d_nchw = T.Buffer((1, 512, 7, 7)) - ff_outer_inner = T.int32() - blockIdx_x = T.int32() - threadIdx_x = T.int32() - pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") - rc_outer_inner = T.int32() - rc_outer_outer = T.int32() - ry_outer_inner = T.int32() - rx_outer_outer = T.int32() - kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") - conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] - -for ff_outer_inner, rc_inner, ff_inner in T.grid(2, 8, 4): - conv2d_nchw = T.Buffer((1, 512, 7, 7)) - blockIdx_x = T.int32() - threadIdx_x = T.int32() - pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") - rc_outer_inner = T.int32() - rc_outer_outer = T.int32() - ry_outer_inner = T.int32() - rx_outer_outer = T.int32() - kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") - conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + 
blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] - -for ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(3, 2, 8, 4): - conv2d_nchw = T.Buffer((1, 512, 7, 7)) - blockIdx_x = T.int32() - threadIdx_x = T.int32() - pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") - rc_outer_inner = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") - conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] - -for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): - conv2d_nchw = T.Buffer((1, 512, 7, 7)) - blockIdx_x = T.int32() - threadIdx_x = T.int32() - pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") - conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] - -blockIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") -for 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): - if T.likely(rx_outer_outer - rx_outer_outer < 1): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): - kernel = T.Buffer((512, 512, 3, 3)) - kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] -for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): - conv2d_nchw = T.Buffer((1, 512, 7, 7)) - threadIdx_x = T.int32() - pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") - conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner 
+ (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] - -kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") -blockIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -with T.realize(kernel_shared[blockIdx_x * 8:blockIdx_x * 8 + 8, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:3, rx_outer_outer:rx_outer_outer + 1]): - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): - if T.likely(rx_outer_outer - rx_outer_outer < 1): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): - kernel = T.Buffer((512, 512, 3, 3)) - kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + 
rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] - for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): - conv2d_nchw = T.Buffer((1, 512, 7, 7)) - threadIdx_x = T.int32() - pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") - conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] - -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") -for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - data = T.Buffer((1, 512, 7, 7)) - pad_temp_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) -kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") -blockIdx_x = T.int32() -T.realize(kernel_shared[blockIdx_x * 8:blockIdx_x * 8 + 8, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:3, rx_outer_outer:rx_outer_outer + 1]) -for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): - if T.likely(rx_outer_outer - rx_outer_outer < 1): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): - kernel = T.Buffer((512, 512, 3, 3)) - kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] -for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): - conv2d_nchw = T.Buffer((1, 512, 7, 7)) - threadIdx_x = T.int32() - conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] - -pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -with T.realize(pad_temp_shared[0:1, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:9, rx_outer_outer:rx_outer_outer + 7]): - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1): - if T.likely((threadIdx_x + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - data = T.Buffer((1, 512, 7, 7)) - pad_temp_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) - kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") - blockIdx_x = T.int32() - T.realize(kernel_shared[blockIdx_x * 8:blockIdx_x * 8 + 8, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:3, 
rx_outer_outer:rx_outer_outer + 1]) - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): - if T.likely(rx_outer_outer - rx_outer_outer < 1): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): - kernel = T.Buffer((512, 512, 3, 3)) - kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] - for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): - conv2d_nchw = T.Buffer((1, 512, 7, 7)) - threadIdx_x = T.int32() - conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner 
+ rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] - -for rx_outer_outer in range(3): - pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") - rc_outer_outer = T.int32() - T.realize(pad_temp_shared[0:1, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:9, rx_outer_outer:rx_outer_outer + 7]) - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - data = T.Buffer((1, 512, 7, 7)) - pad_temp_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) - kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") - blockIdx_x = T.int32() - T.realize(kernel_shared[blockIdx_x * 8:blockIdx_x * 8 + 8, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:3, rx_outer_outer:rx_outer_outer + 1]) - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): - if T.likely(rx_outer_outer - rx_outer_outer < 1): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): - kernel = T.Buffer((512, 512, 3, 3)) - kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] - for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): - conv2d_nchw = T.Buffer((1, 512, 7, 7)) - threadIdx_x = T.int32() - conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] - -for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") - T.realize(pad_temp_shared[0:1, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:9, rx_outer_outer:rx_outer_outer + 7]) - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): - if T.likely(threadIdx_x + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - data = T.Buffer((1, 512, 7, 7)) - pad_temp_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) - kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") - blockIdx_x = T.int32() - T.realize(kernel_shared[blockIdx_x * 8:blockIdx_x * 8 + 8, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:3, rx_outer_outer:rx_outer_outer + 1]) - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): - if T.likely(rx_outer_outer - rx_outer_outer < 1): - if 
T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): - kernel = T.Buffer((512, 512, 3, 3)) - kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] - for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): - conv2d_nchw = T.Buffer((1, 512, 7, 7)) - threadIdx_x = T.int32() - conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] - -conv2d_nchw = T.Buffer((1, 512, 7, 7)) -blockIdx_x = T.int32() -for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): - threadIdx_x = T.int32() - conv2d_nchw[0, ff_inner_init + ff_outer_inner_init * 4 + blockIdx_x * 8, threadIdx_x // 
7, threadIdx_x % 7] = T.float32(0.0) -threadIdx_x = T.int32() -for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") - T.realize(pad_temp_shared[0:1, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:9, rx_outer_outer:rx_outer_outer + 7]) - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - data = T.Buffer((1, 512, 7, 7)) - pad_temp_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) - kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") - T.realize(kernel_shared[blockIdx_x * 8:blockIdx_x * 8 + 8, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:3, rx_outer_outer:rx_outer_outer + 1]) - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): - if T.likely(rx_outer_outer - rx_outer_outer < 1): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): - kernel = T.Buffer((512, 512, 3, 3)) - kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 
49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] - for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): - conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] - -nn_outer_outer_outer_outer = T.int32() -with T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_unroll_explicit", 1): - conv2d_nchw = T.Buffer((1, 512, 7, 7)) - blockIdx_x = T.int32() - for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): - threadIdx_x = T.int32() - conv2d_nchw[0, ff_inner_init + ff_outer_inner_init * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = T.float32(0.0) - threadIdx_x = T.int32() - for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") - T.realize(pad_temp_shared[0:1, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:9, rx_outer_outer:rx_outer_outer + 7]) - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): - if 
T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - data = T.Buffer((1, 512, 7, 7)) - pad_temp_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) - kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") - T.realize(kernel_shared[blockIdx_x * 8:blockIdx_x * 8 + 8, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:3, rx_outer_outer:rx_outer_outer + 1]) - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer 
* 64 - rc_outer_outer * 64 < 64): - if T.likely(rx_outer_outer - rx_outer_outer < 1): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): - kernel = T.Buffer((512, 512, 3, 3)) - kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] - for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): - conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] - -nn_outer_outer_outer_outer = T.int32() -with T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_auto_unroll_max_step", 1024): - T.attr(T.iter_var(nn_outer_outer_outer_outer, None, 
"DataPar", ""), "pragma_unroll_explicit", 1) - conv2d_nchw = T.Buffer((1, 512, 7, 7)) - blockIdx_x = T.int32() - for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): - threadIdx_x = T.int32() - conv2d_nchw[0, ff_inner_init + ff_outer_inner_init * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = T.float32(0.0) - threadIdx_x = T.int32() - for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") - T.realize(pad_temp_shared[0:1, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:9, rx_outer_outer:rx_outer_outer + 7]) - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - data = T.Buffer((1, 512, 7, 7)) - pad_temp_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) - kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") - T.realize(kernel_shared[blockIdx_x * 8:blockIdx_x * 8 + 8, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:3, rx_outer_outer:rx_outer_outer + 1]) - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): - if T.likely(rx_outer_outer - rx_outer_outer < 1): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely((threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): - kernel = T.Buffer((512, 512, 3, 3)) - kernel_shared[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + 
rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] - for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): - conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] - -0 - -8 - -7 - -threadIdx_x // 7 - -7 - -threadIdx_x // 7 // 7 - -vthread - -64 - -blockIdx_x // 64 - -vthread + blockIdx_x // 64 - -threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64) - -i1_inner - -blockIdx_x % 64 - -blockIdx_x % 64 * 8 - -i1_inner + blockIdx_x % 64 * 8 - -threadIdx_x // 7 % 7 - -threadIdx_x % 7 - -conv2d_nchw[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] - -0 - -0 - -bias[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, 0, 0] - -conv2d_nchw[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] + bias[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, 0, 0] - -T.float32(0.0) - -T.max(conv2d_nchw[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] + bias[threadIdx_x // 7 // 7 + 
(vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, 0, 0], T.float32(0.0)) - -compute = T.Buffer((1, 512, 7, 7)) -conv2d_nchw = T.Buffer((1, 512, 7, 7)) -threadIdx_x = T.int32() -vthread = T.int32() -blockIdx_x = T.int32() -i1_inner = T.int32() -bias = T.Buffer((1, 512, 1, 1)) -compute[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] = T.max(conv2d_nchw[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] + bias[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, 0, 0], T.float32(0.0)) - -for i1_inner in range(8): - compute = T.Buffer((1, 512, 7, 7)) - conv2d_nchw = T.Buffer((1, 512, 7, 7)) - threadIdx_x = T.int32() - vthread = T.int32() - blockIdx_x = T.int32() - bias = T.Buffer((1, 512, 1, 1)) - compute[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] = T.max(conv2d_nchw[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] + bias[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, 0, 0], T.float32(0.0)) - -nn_outer_outer_outer_outer = T.int32() -conv2d_nchw = T.Buffer((1, 512, 7, 7)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -with T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_auto_unroll_max_step", 1024): - T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_unroll_explicit", 1) - for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): - conv2d_nchw[0, ff_inner_init + ff_outer_inner_init * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = T.float32(0.0) - for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") - T.realize(pad_temp_shared[0:1, rc_outer_outer * 
64:rc_outer_outer * 64 + 64, 0:9, rx_outer_outer:rx_outer_outer + 7]) - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): - threadIdx_x_1 = T.launch_thread("threadIdx.x", 49) - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): - if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - data = T.Buffer((1, 512, 7, 7)) - pad_temp_shared[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) - kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") - T.realize(kernel_shared[blockIdx_x * 8:blockIdx_x * 8 + 8, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:3, rx_outer_outer:rx_outer_outer + 1]) - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): - threadIdx_x_1 = T.launch_thread("threadIdx.x", 49) - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): - if T.likely(rx_outer_outer - rx_outer_outer < 1): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): - if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): - kernel = T.Buffer((512, 512, 3, 3)) - kernel_shared[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] - for rc_outer_inner, 
ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): - conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] -for i1_inner in range(8): - compute = T.Buffer((1, 512, 7, 7)) - vthread = T.int32() - bias = T.Buffer((1, 512, 1, 1)) - compute[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] = T.max(conv2d_nchw[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] + bias[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, 0, 0], T.float32(0.0)) - -conv2d_nchw = T.Buffer((1, 512, 7, 7)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -with T.realize(conv2d_nchw[0:1, blockIdx_x * 8:blockIdx_x * 8 + 8, threadIdx_x // 7:threadIdx_x // 7 + 1, threadIdx_x % 7:threadIdx_x % 7 + 1]): - nn_outer_outer_outer_outer = T.int32() - with T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_auto_unroll_max_step", 1024): - T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_unroll_explicit", 1) - for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): - conv2d_nchw[0, ff_inner_init + ff_outer_inner_init * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = T.float32(0.0) - for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") - T.realize(pad_temp_shared[0:1, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:9, rx_outer_outer:rx_outer_outer + 7]) - 
for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): - threadIdx_x_1 = T.launch_thread("threadIdx.x", 49) - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): - if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - data = T.Buffer((1, 512, 7, 7)) - pad_temp_shared[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x_1 + 
ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) - kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") - T.realize(kernel_shared[blockIdx_x * 8:blockIdx_x * 8 + 8, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:3, rx_outer_outer:rx_outer_outer + 1]) - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): - threadIdx_x_1 = T.launch_thread("threadIdx.x", 49) - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): - if T.likely(rx_outer_outer - rx_outer_outer < 1): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): - if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): - kernel = T.Buffer((512, 512, 3, 3)) - kernel_shared[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] - for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): - conv2d_nchw[0, 
ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] - for i1_inner in range(8): - compute = T.Buffer((1, 512, 7, 7)) - vthread = T.int32() - bias = T.Buffer((1, 512, 1, 1)) - compute[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] = T.max(conv2d_nchw[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] + bias[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, 0, 0], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - conv2d_nchw = T.Buffer((1, 512, 7, 7)) - blockIdx_x = T.int32() - T.realize(conv2d_nchw[0:1, blockIdx_x * 8:blockIdx_x * 8 + 8, threadIdx_x // 7:threadIdx_x // 7 + 1, threadIdx_x % 7:threadIdx_x % 7 + 1]) - nn_outer_outer_outer_outer = T.int32() - with T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_auto_unroll_max_step", 1024): - T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_unroll_explicit", 1) - for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): - conv2d_nchw[0, ff_inner_init + ff_outer_inner_init * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = T.float32(0.0) - for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") - T.realize(pad_temp_shared[0:1, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:9, rx_outer_outer:rx_outer_outer + 7]) - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): 
- threadIdx_x_1 = T.launch_thread("threadIdx.x", 49) - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): - if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - data = T.Buffer((1, 512, 7, 7)) - pad_temp_shared[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], 
T.float32(0.0)) - kernel_shared = T.Buffer((512, 512, 3, 3), scope="shared") - T.realize(kernel_shared[blockIdx_x * 8:blockIdx_x * 8 + 8, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:3, rx_outer_outer:rx_outer_outer + 1]) - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): - threadIdx_x_1 = T.launch_thread("threadIdx.x", 49) - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): - if T.likely(rx_outer_outer - rx_outer_outer < 1): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): - if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): - kernel = T.Buffer((512, 512, 3, 3)) - kernel_shared[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] - for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): - conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 
7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] - for i1_inner in range(8): - compute = T.Buffer((1, 512, 7, 7)) - vthread = T.int32() - bias = T.Buffer((1, 512, 1, 1)) - compute[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] = T.max(conv2d_nchw[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] + bias[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, 0, 0], T.float32(0.0)) - -with T.launch_thread("vthread", 1) as vthread: - threadIdx_x = T.launch_thread("threadIdx.x", 49) - conv2d_nchw = T.Buffer((1, 512, 7, 7)) - blockIdx_x = T.int32() - T.realize(conv2d_nchw[0:1, blockIdx_x * 8:blockIdx_x * 8 + 8, threadIdx_x // 7:threadIdx_x // 7 + 1, threadIdx_x % 7:threadIdx_x % 7 + 1]) - nn_outer_outer_outer_outer = T.int32() - with T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_auto_unroll_max_step", 1024): - T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_unroll_explicit", 1) - for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): - conv2d_nchw[0, ff_inner_init + ff_outer_inner_init * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = T.float32(0.0) - for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") - T.realize(pad_temp_shared[0:1, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:9, rx_outer_outer:rx_outer_outer + 7]) - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): - threadIdx_x_1 = 
T.launch_thread("threadIdx.x", 49) - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): - if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - data = T.Buffer((1, 512, 7, 7)) - pad_temp_shared[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) - kernel_shared 
= T.Buffer((512, 512, 3, 3), scope="shared") - T.realize(kernel_shared[blockIdx_x * 8:blockIdx_x * 8 + 8, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:3, rx_outer_outer:rx_outer_outer + 1]) - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): - threadIdx_x_1 = T.launch_thread("threadIdx.x", 49) - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): - if T.likely(rx_outer_outer - rx_outer_outer < 1): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): - if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): - kernel = T.Buffer((512, 512, 3, 3)) - kernel_shared[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] - for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): - conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + 
ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] - for i1_inner in range(8): - compute = T.Buffer((1, 512, 7, 7)) - bias = T.Buffer((1, 512, 1, 1)) - compute[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] = T.max(conv2d_nchw[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] + bias[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, 0, 0], T.float32(0.0)) - -with T.launch_thread("blockIdx.x", 64) as blockIdx_x: - vthread = T.launch_thread("vthread", 1) - threadIdx_x = T.launch_thread("threadIdx.x", 49) - conv2d_nchw = T.Buffer((1, 512, 7, 7)) - T.realize(conv2d_nchw[0:1, blockIdx_x * 8:blockIdx_x * 8 + 8, threadIdx_x // 7:threadIdx_x // 7 + 1, threadIdx_x % 7:threadIdx_x % 7 + 1]) - nn_outer_outer_outer_outer = T.int32() - with T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_auto_unroll_max_step", 1024): - T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_unroll_explicit", 1) - for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): - conv2d_nchw[0, ff_inner_init + ff_outer_inner_init * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = T.float32(0.0) - for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") - T.realize(pad_temp_shared[0:1, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:9, rx_outer_outer:rx_outer_outer + 7]) - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): - threadIdx_x_1 = T.launch_thread("threadIdx.x", 49) - if 
T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): - if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - data = T.Buffer((1, 512, 7, 7)) - pad_temp_shared[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) - kernel_shared = T.Buffer((512, 512, 3, 3), 
scope="shared") - T.realize(kernel_shared[blockIdx_x * 8:blockIdx_x * 8 + 8, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:3, rx_outer_outer:rx_outer_outer + 1]) - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): - threadIdx_x_1 = T.launch_thread("threadIdx.x", 49) - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): - if T.likely(rx_outer_outer - rx_outer_outer < 1): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): - if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): - kernel = T.Buffer((512, 512, 3, 3)) - kernel_shared[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] - for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): - conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x 
* 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] - for i1_inner in range(8): - compute = T.Buffer((1, 512, 7, 7)) - bias = T.Buffer((1, 512, 1, 1)) - compute[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] = T.max(conv2d_nchw[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] + bias[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, 0, 0], T.float32(0.0)) - -compute = T.Buffer((1, 512, 7, 7)) -with T.realize(compute[0:1, 0:512, 0:7, 0:7]): - blockIdx_x = T.launch_thread("blockIdx.x", 64) - vthread = T.launch_thread("vthread", 1) - threadIdx_x = T.launch_thread("threadIdx.x", 49) - conv2d_nchw = T.Buffer((1, 512, 7, 7)) - T.realize(conv2d_nchw[0:1, blockIdx_x * 8:blockIdx_x * 8 + 8, threadIdx_x // 7:threadIdx_x // 7 + 1, threadIdx_x % 7:threadIdx_x % 7 + 1]) - nn_outer_outer_outer_outer = T.int32() - with T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_auto_unroll_max_step", 1024): - T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_unroll_explicit", 1) - for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): - conv2d_nchw[0, ff_inner_init + ff_outer_inner_init * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = T.float32(0.0) - for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - pad_temp_shared = T.Buffer((1, 512, 9, 9), scope="shared") - T.realize(pad_temp_shared[0:1, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:9, rx_outer_outer:rx_outer_outer + 7]) - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): - threadIdx_x_1 = 
T.launch_thread("threadIdx.x", 49) - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64 < 1): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - rx_outer_outer < 7): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 < 64): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 < 576): - if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 4032): - data = T.Buffer((1, 512, 7, 7)) - pad_temp_shared[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer] = T.if_then_else((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 >= 1 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 < 8 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer >= 1 and (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer < 8, data[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 // 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 // 9 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 7 % 9 - 1, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 7 + rx_outer_outer - 1], T.float32(0.0)) - kernel_shared 
= T.Buffer((512, 512, 3, 3), scope="shared") - T.realize(kernel_shared[blockIdx_x * 8:blockIdx_x * 8 + 8, rc_outer_outer * 64:rc_outer_outer * 64 + 64, 0:3, rx_outer_outer:rx_outer_outer + 1]) - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): - threadIdx_x_1 = T.launch_thread("threadIdx.x", 49) - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 - blockIdx_x * 8 < 8): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64 - rc_outer_outer * 64 < 64): - if T.likely(rx_outer_outer - rx_outer_outer < 1): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 < 512): - if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 < 1536): - if T.likely((threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8 < 512): - kernel = T.Buffer((512, 512, 3, 3)) - kernel_shared[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] = kernel[(threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 // 64 + blockIdx_x * 8, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) // 3 % 64 + rc_outer_outer * 64, (threadIdx_x_1 + ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) % 3, rx_outer_outer] - for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): - conv2d_nchw[0, ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] = conv2d_nchw[0, ff_inner + 
ff_outer_inner * 4 + blockIdx_x * 8, threadIdx_x // 7, threadIdx_x % 7] + pad_temp_shared[0, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, threadIdx_x // 7 + ry_outer_inner, threadIdx_x % 7 + rx_outer_outer] * kernel_shared[ff_inner + ff_outer_inner * 4 + blockIdx_x * 8, rc_inner + (rc_outer_inner + rc_outer_outer * 8) * 8, ry_outer_inner, rx_outer_outer] - for i1_inner in range(8): - bias = T.Buffer((1, 512, 1, 1)) - compute[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] = T.max(conv2d_nchw[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, threadIdx_x // 7 % 7, threadIdx_x % 7] + bias[threadIdx_x // 7 // 7 + (vthread + blockIdx_x // 64), i1_inner + blockIdx_x % 64 * 8, 0, 0], T.float32(0.0)) - -# from tvm.script import ir as I -# from tvm.script import tir as T - -@I.ir_module -class Module: - @T.prim_func - def main(data: T.Buffer((1, 512, 7, 7), "float32"), kernel: T.Buffer((512, 512, 3, 3), "float32"), bias: T.Buffer((1, 512, 1, 1), "float32"), compute: T.Buffer((1, 512, 7, 7), "float32")): - T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)}) - blockIdx_x = T.launch_thread("blockIdx.x", 64) - conv2d_nchw = T.allocate([8], "float32", "local") - pad_temp_shared = T.allocate([4032], "float32", "shared") - kernel_shared = T.allocate([1536], "float32", "shared") - threadIdx_x = T.launch_thread("threadIdx.x", 49) - conv2d_nchw_1 = T.Buffer((8,), data=conv2d_nchw, scope="local", align=32) - conv2d_nchw_1[0] = T.float32(0.0) - conv2d_nchw_1[1] = T.float32(0.0) - conv2d_nchw_1[2] = T.float32(0.0) - conv2d_nchw_1[3] = T.float32(0.0) - conv2d_nchw_1[4] = T.float32(0.0) - conv2d_nchw_1[5] = T.float32(0.0) - conv2d_nchw_1[6] = T.float32(0.0) - conv2d_nchw_1[7] = T.float32(0.0) - for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - cse_var_2: T.int32 = rc_outer_outer * 3136 - cse_var_1: T.int32 = rc_outer_outer * 576 - 
threadIdx_x_1 = T.env_thread("threadIdx.x") - pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") - data_1 = T.Buffer((25088,), data=data.data) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 49] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 49) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 98] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 98) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 147] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 147) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 196] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 196) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - 
pad_temp_shared_1[threadIdx_x_1 + 245] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 245) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 294] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 294) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 343] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 343) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 392] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 392) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 441] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 335], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 490] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and 
rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 490) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 539] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 539) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 588] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 588) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 637] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 637) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 686] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 686) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 735] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 735) // 
63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 784] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 784) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 833] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 833) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 882] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 678], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 931] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 931) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 980] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 980) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 
1029] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1029) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1078] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1078) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1127] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1127) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1176] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1176) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1225] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1225) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1274] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + 
threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1274) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1323] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 1021], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1372] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1372) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1421] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1421) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1470] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1470) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1519] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1519) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with 
T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1568] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1568) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1617] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1617) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1666] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1666) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1715] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1715) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1764] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 1364], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1813] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= 
rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1813) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1862] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1862) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1911] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1911) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1960] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1960) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2009] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2009) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2058] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 
< 8, data_1[cse_var_2 + (threadIdx_x_1 + 2058) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2107] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2107) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2156] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2156) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2205] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 1707], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2254] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2254) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2303] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2303) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with 
T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2352] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2352) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2401] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2401) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2450] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2450) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2499] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2499) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2548] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2548) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 
2597] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2597) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2646] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 2050], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2695] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2695) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2744] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2744) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2793] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2793) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2842] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2842) // 63 * 49 + 
threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2891] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2891) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2940] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2940) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2989] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2989) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3038] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3038) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3087] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 2393], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3136] = T.if_then_else(1 <= (threadIdx_x_1 
// 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3136) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3185] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3185) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3234] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3234) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3283] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3283) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3332] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3332) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3381] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= 
rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3381) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3430] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3430) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3479] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3479) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3528] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 2736], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3577] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3577) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3626] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3626) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + 
rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3675] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3675) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3724] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3724) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3773] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3773) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3822] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3822) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3871] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3871) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with 
T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3920] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3920) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3969] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 3079], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if threadIdx_x_1 < 14: - pad_temp_shared_1[threadIdx_x_1 + 4018] = T.if_then_else(threadIdx_x_1 < 7 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x_1 + 41], T.float32(0.0)) - threadIdx_x_2 = T.env_thread("threadIdx.x") - kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") - kernel_1 = T.Buffer((2359296,), data=kernel.data) - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2] = kernel_1[blockIdx_x * 36864 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 49] = kernel_1[blockIdx_x * 36864 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 147] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 98] = kernel_1[blockIdx_x * 36864 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 294] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 147] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 147) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 147) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 196] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 196) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + 
rx_outer_outer + 12] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 245] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 245) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 159] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 294] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 294) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 306] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 343] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 343) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 151) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 392] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 392) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 24] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 441] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 441) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 171] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 490] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 490) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 318] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 539] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 539) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 155) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 588] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 588) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 36] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 637] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 637) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 183] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 686] = kernel_1[blockIdx_x * 
36864 + (threadIdx_x_2 + 686) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 330] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 735] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 735) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 159) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 784] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 784) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 48] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 833] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 833) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 195] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 882] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 882) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 342] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 931] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 931) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 163) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 980] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 980) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 60] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1029] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1029) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 207] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1078] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1078) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 354] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1127] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1127) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 167) % 192 * 3 + rx_outer_outer] - with 
T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1176] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1176) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 72] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1225] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1225) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 219] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1274] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1274) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 366] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1323] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1323) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 171) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1372] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1372) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 84] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1421] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1421) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 231] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1470] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1470) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 378] - with T.launch_thread(threadIdx_x_2, 49): - if threadIdx_x_2 < 17: - kernel_shared_1[threadIdx_x_2 + 1519] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1519) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 525] - for rc_outer_inner in range(8): - cse_var_3: T.int32 = rc_outer_inner * 24 - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * 
kernel_shared_1[cse_var_3 + 192] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 384] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 576] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 3] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 195] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 387] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 579] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 6] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 198] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 390] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 582] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 9] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 201] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 393] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 585] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 
252] * kernel_shared_1[cse_var_3 + 12] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 204] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 396] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 588] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 15] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 207] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 399] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 591] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 18] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 210] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 402] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 594] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 21] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 213] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 405] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner 
* 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 597] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 768] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 960] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 1152] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 1344] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 771] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 963] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 1155] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 1347] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 774] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 966] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 1158] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 1350] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 777] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 969] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 1161] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 1353] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 780] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 972] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 1164] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 1356] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 783] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 975] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 1167] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 1359] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 786] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 978] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 1170] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 1362] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 789] - 
conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 981] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 1173] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 1365] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 1] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 193] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 385] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 577] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 4] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 196] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 388] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 580] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 7] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 199] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 391] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 
+ 583] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 10] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 202] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 394] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 586] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 13] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 205] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 397] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 589] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 16] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 208] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 400] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 592] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 19] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 211] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * 
kernel_shared_1[cse_var_3 + 403] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 595] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 22] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 214] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 406] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 598] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 769] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 961] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 1153] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 1345] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 772] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 964] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 1156] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 1348] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 775] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + 
threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 967] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 1159] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 1351] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 778] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 970] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 1162] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 1354] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 781] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 973] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 1165] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 1357] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 784] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 976] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 1168] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 1360] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 787] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 979] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 1171] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 1363] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 790] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 982] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 1174] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 1366] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 2] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 194] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 386] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 578] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 5] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 197] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 389] - conv2d_nchw_1[3] = 
conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 581] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 8] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 200] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 392] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 584] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 11] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 203] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 395] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 587] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 14] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 206] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 398] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 590] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 17] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 209] - 
conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 401] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 593] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 20] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 212] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 404] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 596] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 23] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 215] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 407] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 599] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 770] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 962] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 1154] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 1346] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * 
kernel_shared_1[cse_var_3 + 773] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 965] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 1157] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 1349] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 776] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 968] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 1160] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 1352] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 779] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 971] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 1163] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 1355] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 782] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 974] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 1166] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner 
* 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 1358] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 785] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 977] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 1169] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 1361] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 788] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 980] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 1172] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 1364] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 791] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 983] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 1175] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 1367] - for i1_inner in range(8): - compute_1 = T.Buffer((25088,), data=compute.data) - bias_1 = T.Buffer((512,), data=bias.data) - compute_1[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw_1[i1_inner] + bias_1[blockIdx_x * 8 + i1_inner], T.float32(0.0)) -Phase 1 --------------------- -64 - -1 - 
-49 - -8 - -1024 - -1 - -0 - -2 - -0 - -4 - -T.float32(0.0) - -ff_outer_inner_init - -4 - -ff_outer_inner_init * 4 - -ff_inner_init - -ff_outer_inner_init * 4 + ff_inner_init - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -ff_outer_inner_init = T.int32() -ff_inner_init = T.int32() -conv2d_nchw[ff_outer_inner_init * 4 + ff_inner_init] = T.float32(0.0) - -for ff_inner_init in range(4): - conv2d_nchw = T.Buffer((8,), scope="local", align=32) - ff_outer_inner_init = T.int32() - conv2d_nchw[ff_outer_inner_init * 4 + ff_inner_init] = T.float32(0.0) - -for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): - conv2d_nchw = T.Buffer((8,), scope="local", align=32) - conv2d_nchw[ff_outer_inner_init * 4 + ff_inner_init] = T.float32(0.0) - -0 - -8 - -0 - -3 - -4032 - -0 - -83 - -49 - -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer - -7 - -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 - -threadIdx_x - -7 - -threadIdx_x // 7 - -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 - -576 - -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 < 576 - -T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 < 576) - -49 - -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 - -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x - -4032 - -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 4032 - -T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 4032) - -1 - -7 - -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 - -7 - -threadIdx_x // 7 - -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 - -9 - -(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 - -1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 - -7 - -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 - -7 - -threadIdx_x // 7 - -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 - -9 - 
-(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 - -8 - -(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 - -1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 - -1 - -rx_outer_outer - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -rc_outer_outer - -3136 - -rc_outer_outer * 3136 - -7 - -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 - -7 - -threadIdx_x // 7 - -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 - -9 - -(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 - -49 - -(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 - -7 - -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 - -7 - -threadIdx_x // 7 - -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 - -9 - -(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 - -7 - -(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + 
threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.float32(0.0) - -T.if_then_else(1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -49 - -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 - -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = T.if_then_else(1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x 
// 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() -threadIdx_x = T.int32() -if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 4032): - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = T.if_then_else(1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() -threadIdx_x = T.int32() -if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 < 576): - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 4032): - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = T.if_then_else(1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 
8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 < 576): - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 4032): - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = T.if_then_else(1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 < 576): - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 4032): - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = T.if_then_else(1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 and 
(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1536 - -0 - -32 - -49 - -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer - -49 - -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 - -threadIdx_x - -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x - -1536 - -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 1536 - -T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 1536) - -blockIdx_x - -36864 - -blockIdx_x * 36864 - -49 - -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 - -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x - -192 - -(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 - -4608 - -(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 - -576 - -rc_outer_outer * 576 - -blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 - -49 - -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 - -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x - -192 - -(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 - -3 - -(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + 
threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 + rx_outer_outer] - -49 - -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 - -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 + rx_outer_outer] - -ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() -threadIdx_x = T.int32() -if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 1536): - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = T.int32() - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 1536): - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = 
T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 + rx_outer_outer] - -for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 1536): - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 + rx_outer_outer] - -0 - -8 - -0 - -3 - -0 - -0 - -8 - -0 - -ff_outer_inner - -4 - -ff_outer_inner * 4 - -ff_inner - -ff_outer_inner * 4 + ff_inner - -conv2d_nchw[ff_outer_inner * 4 + ff_inner] - -rc_outer_inner - -504 - -rc_outer_inner * 504 - -rc_inner - -63 - -rc_inner * 63 - -rc_outer_inner * 504 + rc_inner * 63 - -ry_outer_inner - -7 - -ry_outer_inner * 7 - -rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 - -threadIdx_x - -rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] - -768 - -ff_outer_inner * 768 - -192 - -ff_inner * 192 - -ff_outer_inner * 768 + ff_inner * 192 - -24 - -rc_outer_inner * 24 - -ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 - -3 - -rc_inner * 3 - -ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 - -ff_outer_inner * 
768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner - -kernel_shared[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] - -pad_temp_shared[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] - -conv2d_nchw[ff_outer_inner * 4 + ff_inner] + pad_temp_shared[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] - -4 - -ff_outer_inner * 4 - -ff_outer_inner * 4 + ff_inner - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -ff_outer_inner = T.int32() -ff_inner = T.int32() -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -rc_inner = T.int32() -ry_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[ff_outer_inner * 4 + ff_inner] = conv2d_nchw[ff_outer_inner * 4 + ff_inner] + pad_temp_shared[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] - -for ff_inner in range(4): - conv2d_nchw = T.Buffer((8,), scope="local", align=32) - ff_outer_inner = T.int32() - pad_temp_shared = T.Buffer((4032,), scope="shared") - rc_outer_inner = T.int32() - rc_inner = T.int32() - ry_outer_inner = T.int32() - threadIdx_x = T.int32() - kernel_shared = T.Buffer((1536,), scope="shared") - conv2d_nchw[ff_outer_inner * 4 + ff_inner] = conv2d_nchw[ff_outer_inner * 4 + ff_inner] + pad_temp_shared[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] - -for rc_inner, ff_inner in T.grid(8, 4): - conv2d_nchw = T.Buffer((8,), 
scope="local", align=32) - ff_outer_inner = T.int32() - pad_temp_shared = T.Buffer((4032,), scope="shared") - rc_outer_inner = T.int32() - ry_outer_inner = T.int32() - threadIdx_x = T.int32() - kernel_shared = T.Buffer((1536,), scope="shared") - conv2d_nchw[ff_outer_inner * 4 + ff_inner] = conv2d_nchw[ff_outer_inner * 4 + ff_inner] + pad_temp_shared[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] - -for ff_outer_inner, rc_inner, ff_inner in T.grid(2, 8, 4): - conv2d_nchw = T.Buffer((8,), scope="local", align=32) - pad_temp_shared = T.Buffer((4032,), scope="shared") - rc_outer_inner = T.int32() - ry_outer_inner = T.int32() - threadIdx_x = T.int32() - kernel_shared = T.Buffer((1536,), scope="shared") - conv2d_nchw[ff_outer_inner * 4 + ff_inner] = conv2d_nchw[ff_outer_inner * 4 + ff_inner] + pad_temp_shared[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] - -for ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(3, 2, 8, 4): - conv2d_nchw = T.Buffer((8,), scope="local", align=32) - pad_temp_shared = T.Buffer((4032,), scope="shared") - rc_outer_inner = T.int32() - threadIdx_x = T.int32() - kernel_shared = T.Buffer((1536,), scope="shared") - conv2d_nchw[ff_outer_inner * 4 + ff_inner] = conv2d_nchw[ff_outer_inner * 4 + ff_inner] + pad_temp_shared[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] - -for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): - conv2d_nchw = T.Buffer((8,), scope="local", align=32) - pad_temp_shared = T.Buffer((4032,), scope="shared") - threadIdx_x = T.int32() - kernel_shared = T.Buffer((1536,), 
scope="shared") - conv2d_nchw[ff_outer_inner * 4 + ff_inner] = conv2d_nchw[ff_outer_inner * 4 + ff_inner] + pad_temp_shared[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] - -kernel_shared = T.Buffer((1536,), scope="shared") -for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 1536): - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 + rx_outer_outer] -for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): - conv2d_nchw = T.Buffer((8,), scope="local", align=32) - pad_temp_shared = T.Buffer((4032,), scope="shared") - threadIdx_x = T.int32() - conv2d_nchw[ff_outer_inner * 4 + ff_inner] = conv2d_nchw[ff_outer_inner * 4 + ff_inner] + pad_temp_shared[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] - -T.bool(True) - -with T.allocate([1536], "float32", "shared") as kernel_shared: - kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 1536): - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - 
kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 + rx_outer_outer] - for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): - conv2d_nchw = T.Buffer((8,), scope="local", align=32) - pad_temp_shared = T.Buffer((4032,), scope="shared") - threadIdx_x = T.int32() - conv2d_nchw[ff_outer_inner * 4 + ff_inner] = conv2d_nchw[ff_outer_inner * 4 + ff_inner] + pad_temp_shared[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared_1[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] - -pad_temp_shared = T.Buffer((4032,), scope="shared") -rx_outer_outer = T.int32() -rc_outer_outer = T.int32() -for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 < 576): - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 4032): - data = T.Buffer((25088,)) - pad_temp_shared[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = T.if_then_else(1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -kernel_shared = T.allocate([1536], "float32", "shared") -kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") 
-for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 1536): - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 + rx_outer_outer] -for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): - conv2d_nchw = T.Buffer((8,), scope="local", align=32) - threadIdx_x = T.int32() - conv2d_nchw[ff_outer_inner * 4 + ff_inner] = conv2d_nchw[ff_outer_inner * 4 + ff_inner] + pad_temp_shared[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared_1[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] - -T.bool(True) - -with T.allocate([4032], "float32", "shared") as pad_temp_shared: - pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") - rx_outer_outer = T.int32() - rc_outer_outer = T.int32() - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 < 576): - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 4032): - data = T.Buffer((25088,)) - pad_temp_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = T.if_then_else(1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + 
(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - kernel_shared = T.allocate([1536], "float32", "shared") - kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 1536): - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 + rx_outer_outer] - for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): - conv2d_nchw = T.Buffer((8,), scope="local", align=32) - threadIdx_x = T.int32() - conv2d_nchw[ff_outer_inner * 4 + ff_inner] = conv2d_nchw[ff_outer_inner * 4 + ff_inner] + pad_temp_shared_1[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared_1[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] - -for rx_outer_outer in range(3): - pad_temp_shared = T.allocate([4032], "float32", "shared") - pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") - rc_outer_outer = T.int32() - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 < 576): - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 4032): - data = T.Buffer((25088,)) - pad_temp_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 
+ threadIdx_x] = T.if_then_else(1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - kernel_shared = T.allocate([1536], "float32", "shared") - kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 1536): - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 + rx_outer_outer] - for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): - conv2d_nchw = T.Buffer((8,), scope="local", align=32) - threadIdx_x = T.int32() - conv2d_nchw[ff_outer_inner * 4 + ff_inner] = conv2d_nchw[ff_outer_inner * 4 + ff_inner] + pad_temp_shared_1[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared_1[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] - -for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - pad_temp_shared = T.allocate([4032], "float32", "shared") - pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): - threadIdx_x = 
T.launch_thread("threadIdx.x", 49) - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 < 576): - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 4032): - data = T.Buffer((25088,)) - pad_temp_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = T.if_then_else(1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - kernel_shared = T.allocate([1536], "float32", "shared") - kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 1536): - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 + rx_outer_outer] - for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): - conv2d_nchw = T.Buffer((8,), scope="local", align=32) - threadIdx_x = T.int32() - conv2d_nchw[ff_outer_inner * 4 + ff_inner] = conv2d_nchw[ff_outer_inner * 4 + ff_inner] + pad_temp_shared_1[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared_1[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + 
rc_inner * 3 + ry_outer_inner] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): - conv2d_nchw[ff_outer_inner_init * 4 + ff_inner_init] = T.float32(0.0) -for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - pad_temp_shared = T.allocate([4032], "float32", "shared") - pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 < 576): - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 4032): - data = T.Buffer((25088,)) - pad_temp_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = T.if_then_else(1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - kernel_shared = T.allocate([1536], "float32", "shared") - kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 1536): - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + 
threadIdx_x) % 192 * 3 + rx_outer_outer] - for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): - threadIdx_x = T.int32() - conv2d_nchw[ff_outer_inner * 4 + ff_inner] = conv2d_nchw[ff_outer_inner * 4 + ff_inner] + pad_temp_shared_1[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared_1[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] - -nn_outer_outer_outer_outer = T.int32() -with T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_unroll_explicit", 1): - conv2d_nchw = T.Buffer((8,), scope="local", align=32) - for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): - conv2d_nchw[ff_outer_inner_init * 4 + ff_inner_init] = T.float32(0.0) - for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - pad_temp_shared = T.allocate([4032], "float32", "shared") - pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 < 576): - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 4032): - data = T.Buffer((25088,)) - pad_temp_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = T.if_then_else(1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - kernel_shared = T.allocate([1536], "float32", "shared") - kernel_shared_1 = 
T.Buffer((1536,), data=kernel_shared, scope="shared") - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 1536): - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 + rx_outer_outer] - for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): - threadIdx_x = T.int32() - conv2d_nchw[ff_outer_inner * 4 + ff_inner] = conv2d_nchw[ff_outer_inner * 4 + ff_inner] + pad_temp_shared_1[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared_1[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] - -nn_outer_outer_outer_outer = T.int32() -with T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_auto_unroll_max_step", 1024): - T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_unroll_explicit", 1) - conv2d_nchw = T.Buffer((8,), scope="local", align=32) - for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): - conv2d_nchw[ff_outer_inner_init * 4 + ff_inner_init] = T.float32(0.0) - for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - pad_temp_shared = T.allocate([4032], "float32", "shared") - pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 < 576): - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 4032): - data = 
T.Buffer((25088,)) - pad_temp_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = T.if_then_else(1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - kernel_shared = T.allocate([1536], "float32", "shared") - kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 1536): - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 + rx_outer_outer] - for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): - threadIdx_x = T.int32() - conv2d_nchw[ff_outer_inner * 4 + ff_inner] = conv2d_nchw[ff_outer_inner * 4 + ff_inner] + pad_temp_shared_1[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared_1[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] - -0 - -8 - -i1_inner - -conv2d_nchw[i1_inner] - -8 - -blockIdx_x * 8 - -blockIdx_x * 8 + i1_inner - -bias[blockIdx_x * 8 + i1_inner] - -conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner] - -T.float32(0.0) - -T.max(conv2d_nchw[i1_inner] + 
bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) - -392 - -blockIdx_x * 392 - -49 - -i1_inner * 49 - -blockIdx_x * 392 + i1_inner * 49 - -blockIdx_x * 392 + i1_inner * 49 + threadIdx_x - -compute = T.Buffer((25088,)) -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -i1_inner = T.int32() -bias = T.Buffer((512,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) - -for i1_inner in range(8): - compute = T.Buffer((25088,)) - conv2d_nchw = T.Buffer((8,), scope="local", align=32) - bias = T.Buffer((512,)) - blockIdx_x = T.int32() - threadIdx_x = T.int32() - compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) - -nn_outer_outer_outer_outer = T.int32() -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -with T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_auto_unroll_max_step", 1024): - T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_unroll_explicit", 1) - for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): - conv2d_nchw[ff_outer_inner_init * 4 + ff_inner_init] = T.float32(0.0) - for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - pad_temp_shared = T.allocate([4032], "float32", "shared") - pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 < 576): - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 4032): - data = T.Buffer((25088,)) - pad_temp_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = T.if_then_else(1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 
+ threadIdx_x // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - kernel_shared = T.allocate([1536], "float32", "shared") - kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 1536): - kernel = T.Buffer((2359296,)) - kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 + rx_outer_outer] - for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): - conv2d_nchw[ff_outer_inner * 4 + ff_inner] = conv2d_nchw[ff_outer_inner * 4 + ff_inner] + pad_temp_shared_1[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared_1[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] -for i1_inner in range(8): - compute = T.Buffer((25088,)) - bias = T.Buffer((512,)) - compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) - -T.bool(True) - -with T.allocate([8], "float32", "local") as conv2d_nchw: - nn_outer_outer_outer_outer = T.int32() - conv2d_nchw_1 = T.Buffer((8,), data=conv2d_nchw, scope="local", align=32) - blockIdx_x = T.int32() - threadIdx_x = T.int32() - with 
T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_auto_unroll_max_step", 1024): - T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_unroll_explicit", 1) - for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): - conv2d_nchw_1[ff_outer_inner_init * 4 + ff_inner_init] = T.float32(0.0) - for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - pad_temp_shared = T.allocate([4032], "float32", "shared") - pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7 < 576): - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 4032): - data = T.Buffer((25088,)) - pad_temp_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = T.if_then_else(1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - kernel_shared = T.allocate([1536], "float32", "shared") - kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): - threadIdx_x = T.launch_thread("threadIdx.x", 49) - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x < 1536): - kernel = T.Buffer((2359296,)) - kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + 
threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x) % 192 * 3 + rx_outer_outer] - for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): - conv2d_nchw_1[ff_outer_inner * 4 + ff_inner] = conv2d_nchw_1[ff_outer_inner * 4 + ff_inner] + pad_temp_shared_1[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared_1[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] - for i1_inner in range(8): - compute = T.Buffer((25088,)) - bias = T.Buffer((512,)) - compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw_1[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - conv2d_nchw = T.allocate([8], "float32", "local") - nn_outer_outer_outer_outer = T.int32() - conv2d_nchw_1 = T.Buffer((8,), data=conv2d_nchw, scope="local", align=32) - blockIdx_x = T.int32() - with T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_auto_unroll_max_step", 1024): - T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_unroll_explicit", 1) - for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): - conv2d_nchw_1[ff_outer_inner_init * 4 + ff_inner_init] = T.float32(0.0) - for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - pad_temp_shared = T.allocate([4032], "float32", "shared") - pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): - threadIdx_x_1 = T.launch_thread("threadIdx.x", 49) - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_1 // 7 < 576): - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1 < 4032): - data = T.Buffer((25088,)) - pad_temp_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1] = T.if_then_else(1 <= 
(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_1 // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_1 // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - kernel_shared = T.allocate([1536], "float32", "shared") - kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): - threadIdx_x_1 = T.launch_thread("threadIdx.x", 49) - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1 < 1536): - kernel = T.Buffer((2359296,)) - kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): - conv2d_nchw_1[ff_outer_inner * 4 + ff_inner] = conv2d_nchw_1[ff_outer_inner * 4 + ff_inner] + pad_temp_shared_1[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared_1[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] - for i1_inner in range(8): - compute = T.Buffer((25088,)) - bias = T.Buffer((512,)) - compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw_1[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) - -with T.launch_thread("vthread", 1) as vthread: - threadIdx_x = T.launch_thread("threadIdx.x", 49) - conv2d_nchw = T.allocate([8], "float32", "local") - 
nn_outer_outer_outer_outer = T.int32() - conv2d_nchw_1 = T.Buffer((8,), data=conv2d_nchw, scope="local", align=32) - blockIdx_x = T.int32() - with T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_auto_unroll_max_step", 1024): - T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_unroll_explicit", 1) - for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): - conv2d_nchw_1[ff_outer_inner_init * 4 + ff_inner_init] = T.float32(0.0) - for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - pad_temp_shared = T.allocate([4032], "float32", "shared") - pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): - threadIdx_x_1 = T.launch_thread("threadIdx.x", 49) - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_1 // 7 < 576): - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1 < 4032): - data = T.Buffer((25088,)) - pad_temp_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1] = T.if_then_else(1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_1 // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_1 // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - kernel_shared = T.allocate([1536], "float32", "shared") - kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): - threadIdx_x_1 = T.launch_thread("threadIdx.x", 49) - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1 < 1536): - kernel = T.Buffer((2359296,)) - 
kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): - conv2d_nchw_1[ff_outer_inner * 4 + ff_inner] = conv2d_nchw_1[ff_outer_inner * 4 + ff_inner] + pad_temp_shared_1[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared_1[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] - for i1_inner in range(8): - compute = T.Buffer((25088,)) - bias = T.Buffer((512,)) - compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw_1[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) - -with T.launch_thread("blockIdx.x", 64) as blockIdx_x: - vthread = T.launch_thread("vthread", 1) - threadIdx_x = T.launch_thread("threadIdx.x", 49) - conv2d_nchw = T.allocate([8], "float32", "local") - nn_outer_outer_outer_outer = T.int32() - conv2d_nchw_1 = T.Buffer((8,), data=conv2d_nchw, scope="local", align=32) - with T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_auto_unroll_max_step", 1024): - T.attr(T.iter_var(nn_outer_outer_outer_outer, None, "DataPar", ""), "pragma_unroll_explicit", 1) - for ff_outer_inner_init, ff_inner_init in T.grid(2, 4): - conv2d_nchw_1[ff_outer_inner_init * 4 + ff_inner_init] = T.float32(0.0) - for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - pad_temp_shared = T.allocate([4032], "float32", "shared") - pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(83): - threadIdx_x_1 = T.launch_thread("threadIdx.x", 49) - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_1 // 
7 < 576): - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1 < 4032): - data = T.Buffer((25088,)) - pad_temp_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1] = T.if_then_else(1 <= (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_1 // 7) % 9 and (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_1 // 7) // 9 * 49 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - kernel_shared = T.allocate([1536], "float32", "shared") - kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") - for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(32): - threadIdx_x_1 = T.launch_thread("threadIdx.x", 49) - if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1 < 1536): - kernel = T.Buffer((2359296,)) - kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49 + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - for rc_outer_inner, ry_outer_inner, ff_outer_inner, rc_inner, ff_inner in T.grid(8, 3, 2, 8, 4): - conv2d_nchw_1[ff_outer_inner * 4 + ff_inner] = conv2d_nchw_1[ff_outer_inner * 4 + ff_inner] + pad_temp_shared_1[rc_outer_inner * 504 + rc_inner * 63 + ry_outer_inner * 7 + threadIdx_x] * kernel_shared_1[ff_outer_inner * 768 + ff_inner * 192 + rc_outer_inner * 24 + rc_inner * 3 + ry_outer_inner] - for i1_inner in range(8): - compute = T.Buffer((25088,)) - bias = T.Buffer((512,)) - compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = 
T.max(conv2d_nchw_1[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) - -# from tvm.script import ir as I -# from tvm.script import tir as T - -@I.ir_module -class Module: - @T.prim_func - def main(data: T.Buffer((1, 512, 7, 7), "float32"), kernel: T.Buffer((512, 512, 3, 3), "float32"), bias: T.Buffer((1, 512, 1, 1), "float32"), compute: T.Buffer((1, 512, 7, 7), "float32")): - T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)}) - blockIdx_x = T.launch_thread("blockIdx.x", 64) - conv2d_nchw = T.allocate([8], "float32", "local") - pad_temp_shared = T.allocate([4032], "float32", "shared") - kernel_shared = T.allocate([1536], "float32", "shared") - threadIdx_x = T.launch_thread("threadIdx.x", 49) - conv2d_nchw_1 = T.Buffer((8,), data=conv2d_nchw, scope="local", align=32) - conv2d_nchw_1[0] = T.float32(0.0) - conv2d_nchw_1[1] = T.float32(0.0) - conv2d_nchw_1[2] = T.float32(0.0) - conv2d_nchw_1[3] = T.float32(0.0) - conv2d_nchw_1[4] = T.float32(0.0) - conv2d_nchw_1[5] = T.float32(0.0) - conv2d_nchw_1[6] = T.float32(0.0) - conv2d_nchw_1[7] = T.float32(0.0) - for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - cse_var_2: T.int32 = rc_outer_outer * 3136 - cse_var_1: T.int32 = rc_outer_outer * 576 - threadIdx_x_1 = T.env_thread("threadIdx.x") - pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") - data_1 = T.Buffer((25088,), data=data.data) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 49] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 49) // 
63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 98] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 98) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 147] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 147) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 196] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 196) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 245] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 245) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 294] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 294) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], 
T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 343] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 343) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 392] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 392) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 441] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 335], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 490] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 490) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 539] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 539) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 588] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 
< 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 588) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 637] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 637) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 686] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 686) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 735] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 735) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 784] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 784) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 833] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 833) // 
63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 882] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 678], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 931] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 931) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 980] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 980) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1029] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1029) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1078] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1078) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1127] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and 
(threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1127) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1176] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1176) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1225] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1225) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1274] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1274) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1323] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 1021], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1372] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1372) // 63 * 49 
+ (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1421] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1421) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1470] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1470) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1519] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1519) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1568] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1568) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1617] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1617) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 
8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1666] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1666) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1715] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1715) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1764] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 1364], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1813] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1813) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1862] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1862) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1911] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 
// 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1911) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1960] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1960) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2009] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2009) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2058] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2058) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2107] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2107) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2156] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + 
(threadIdx_x_1 + 2156) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2205] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 1707], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2254] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2254) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2303] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2303) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2352] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2352) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2401] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2401) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2450] = T.if_then_else(1 <= 
(threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2450) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2499] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2499) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2548] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2548) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2597] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2597) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2646] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 2050], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2695] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + 
(threadIdx_x_1 + 2695) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2744] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2744) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2793] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2793) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2842] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2842) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2891] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2891) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2940] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2940) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + 
rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2989] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2989) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3038] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3038) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3087] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 2393], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3136] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3136) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3185] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3185) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3234] = T.if_then_else(1 <= 
(threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3234) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3283] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3283) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3332] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3332) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3381] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3381) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3430] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3430) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3479] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and 
rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3479) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3528] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 2736], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3577] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3577) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3626] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3626) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3675] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3675) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3724] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3724) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - 
pad_temp_shared_1[threadIdx_x_1 + 3773] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3773) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3822] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3822) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3871] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3871) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3920] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3920) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3969] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 3079], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if threadIdx_x_1 < 14: - pad_temp_shared_1[threadIdx_x_1 + 4018] = T.if_then_else(threadIdx_x_1 < 7 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + 
threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x_1 + 41], T.float32(0.0)) - threadIdx_x_2 = T.env_thread("threadIdx.x") - kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") - kernel_1 = T.Buffer((2359296,), data=kernel.data) - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2] = kernel_1[blockIdx_x * 36864 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 49] = kernel_1[blockIdx_x * 36864 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 147] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 98] = kernel_1[blockIdx_x * 36864 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 294] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 147] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 147) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 147) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 196] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 196) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 12] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 245] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 245) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 159] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 294] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 294) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 306] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 343] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 343) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 151) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 392] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 392) // 192 * 4608 + 
cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 24] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 441] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 441) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 171] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 490] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 490) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 318] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 539] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 539) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 155) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 588] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 588) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 36] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 637] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 637) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 183] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 686] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 686) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 330] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 735] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 735) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 159) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 784] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 784) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 48] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 833] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 833) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 195] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 
882] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 882) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 342] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 931] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 931) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 163) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 980] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 980) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 60] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1029] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1029) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 207] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1078] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1078) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 354] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1127] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1127) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 167) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1176] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1176) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 72] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1225] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1225) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 219] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1274] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1274) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 366] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1323] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1323) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 171) % 192 * 
3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1372] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1372) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 84] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1421] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1421) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 231] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1470] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1470) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 378] - with T.launch_thread(threadIdx_x_2, 49): - if threadIdx_x_2 < 17: - kernel_shared_1[threadIdx_x_2 + 1519] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1519) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 525] - for rc_outer_inner in range(8): - cse_var_3: T.int32 = rc_outer_inner * 24 - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 192] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 384] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 576] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 3] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 195] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 387] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 579] - 
conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 6] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 198] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 390] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 582] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 9] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 201] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 393] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 585] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 12] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 204] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 396] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 588] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 15] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 207] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * 
kernel_shared_1[cse_var_3 + 399] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 591] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 18] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 210] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 402] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 594] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 21] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 213] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 405] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 597] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 768] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 960] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 1152] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 1344] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 771] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * 
kernel_shared_1[cse_var_3 + 963] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 1155] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 1347] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 774] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 966] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 1158] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 1350] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 777] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 969] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 1161] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 1353] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 780] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 972] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 1164] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 1356] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 783] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 975] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 1167] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 1359] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 786] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 978] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 1170] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 1362] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 789] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 981] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 1173] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 1365] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 1] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 193] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 385] - 
conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 577] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 4] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 196] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 388] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 580] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 7] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 199] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 391] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 583] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 10] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 202] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 394] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 586] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 13] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * 
kernel_shared_1[cse_var_3 + 205] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 397] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 589] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 16] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 208] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 400] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 592] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 19] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 211] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 403] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 595] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 22] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 214] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 406] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 598] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 
+ threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 769] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 961] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 1153] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 1345] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 772] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 964] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 1156] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 1348] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 775] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 967] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 1159] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 1351] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 778] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 970] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 1162] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 1354] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 781] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 973] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 1165] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 1357] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 784] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 976] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 1168] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 1360] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 787] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 979] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 1171] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 1363] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 790] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 982] - 
conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 1174] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 1366] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 2] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 194] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 386] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 578] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 5] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 197] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 389] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 581] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 8] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 200] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 392] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 584] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * 
kernel_shared_1[cse_var_3 + 11] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 203] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 395] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 587] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 14] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 206] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 398] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 590] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 17] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 209] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 401] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 593] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 20] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 212] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 404] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + 
threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 596] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 23] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 215] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 407] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 599] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 770] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 962] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 1154] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 1346] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 773] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 965] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 1157] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 1349] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 776] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 968] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 1160] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 1352] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 779] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 971] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 1163] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 1355] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 782] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 974] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 1166] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 1358] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 785] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 977] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 1169] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 1361] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 788] - 
conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 980] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 1172] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 1364] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 791] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 983] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 1175] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 1367] - for i1_inner in range(8): - compute_1 = T.Buffer((25088,), data=compute.data) - bias_1 = T.Buffer((512,), data=bias.data) - compute_1[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw_1[i1_inner] + bias_1[blockIdx_x * 8 + i1_inner], T.float32(0.0)) -Phase 2 --------------------- -64 - -8 - -4032 - -1536 - -49 - -T.float32(0.0) - -0 - -4 - -T.Mul(0, 4) - -0 - -T.Add(T.Mul(0, 4), 0) - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = T.float32(0.0) - -T.Mul(0, 4) - -1 - -T.Mul(0, 4) + 1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -conv2d_nchw[T.Mul(0, 4) + 1] = T.float32(0.0) - -T.Mul(0, 4) - -2 - -T.Mul(0, 4) + 2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -conv2d_nchw[T.Mul(0, 4) + 2] = T.float32(0.0) - -T.Mul(0, 4) - -3 - -T.Mul(0, 4) + 3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -conv2d_nchw[T.Mul(0, 4) + 3] = T.float32(0.0) - -1 - -T.Mul(1, 4) - -T.Add(T.Mul(1, 4), 0) - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) 
-conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = T.float32(0.0) - -T.Mul(1, 4) - -T.Mul(1, 4) + 1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -conv2d_nchw[T.Mul(1, 4) + 1] = T.float32(0.0) - -T.Mul(1, 4) - -T.Mul(1, 4) + 2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -conv2d_nchw[T.Mul(1, 4) + 2] = T.float32(0.0) - -T.Mul(1, 4) - -T.Mul(1, 4) + 3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -conv2d_nchw[T.Mul(1, 4) + 3] = T.float32(0.0) - -0 - -8 - -0 - -3 - -49 - -0 - -7 - -T.Mul(0, 7) - -threadIdx_x - -7 - -threadIdx_x // 7 - -T.Mul(0, 7) + threadIdx_x // 7 - -576 - -T.Mul(0, 7) + threadIdx_x // 7 < 576 - -49 - -T.Mul(0, 49) - -T.Mul(0, 49) + threadIdx_x - -4032 - -T.Mul(0, 49) + threadIdx_x < 4032 - -1 - -7 - -T.Mul(0, 7) - -7 - -threadIdx_x // 7 - -T.Mul(0, 7) + threadIdx_x // 7 - -9 - -(T.Mul(0, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(0, 7) + threadIdx_x // 7) % 9 - -7 - -T.Mul(0, 7) - -7 - -threadIdx_x // 7 - -T.Mul(0, 7) + threadIdx_x // 7 - -9 - -(T.Mul(0, 7) + threadIdx_x // 7) % 9 - -8 - -(T.Mul(0, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(0, 7) + threadIdx_x // 7) % 9 and (T.Mul(0, 7) + threadIdx_x // 7) % 9 < 8 - -1 - -rx_outer_outer - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(0, 7) + threadIdx_x // 7) % 9 and (T.Mul(0, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (T.Mul(0, 7) + threadIdx_x // 7) % 9 and (T.Mul(0, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -rc_outer_outer - -3136 - -rc_outer_outer * 3136 - -7 - -T.Mul(0, 7) - -7 - -threadIdx_x // 7 - -T.Mul(0, 7) + threadIdx_x // 7 - -9 - -(T.Mul(0, 7) + threadIdx_x // 7) // 9 - -49 - -(T.Mul(0, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x // 7) // 9 
* 49 - -7 - -T.Mul(0, 7) - -7 - -threadIdx_x // 7 - -T.Mul(0, 7) + threadIdx_x // 7 - -9 - -(T.Mul(0, 7) + threadIdx_x // 7) % 9 - -7 - -(T.Mul(0, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.float32(0.0) - -T.if_then_else(1 <= (T.Mul(0, 7) + threadIdx_x // 7) % 9 and (T.Mul(0, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -49 - -T.Mul(0, 49) - -T.Mul(0, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(0, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(0, 7) + threadIdx_x // 7) % 9 and (T.Mul(0, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(0, 49) + threadIdx_x < 4032: - pad_temp_shared = 
T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(0, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(0, 7) + threadIdx_x // 7) % 9 and (T.Mul(0, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(0, 7) + threadIdx_x // 7 < 576: - if T.Mul(0, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(0, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(0, 7) + threadIdx_x // 7) % 9 and (T.Mul(0, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(0, 7) + threadIdx_x // 7 < 576: - if T.Mul(0, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(0, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(0, 7) + threadIdx_x // 7) % 9 and (T.Mul(0, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -T.Mul(1, 7) - -T.Mul(1, 7) + threadIdx_x // 7 - -T.Mul(1, 7) + threadIdx_x // 7 < 576 - -T.Mul(1, 49) - -T.Mul(1, 49) + 
threadIdx_x - -T.Mul(1, 49) + threadIdx_x < 4032 - -T.Mul(1, 7) - -T.Mul(1, 7) + threadIdx_x // 7 - -(T.Mul(1, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(1, 7) + threadIdx_x // 7) % 9 - -T.Mul(1, 7) - -T.Mul(1, 7) + threadIdx_x // 7 - -(T.Mul(1, 7) + threadIdx_x // 7) % 9 - -(T.Mul(1, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(1, 7) + threadIdx_x // 7) % 9 and (T.Mul(1, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(1, 7) + threadIdx_x // 7) % 9 and (T.Mul(1, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(1, 7) + threadIdx_x // 7) % 9 and (T.Mul(1, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(1, 7) - -T.Mul(1, 7) + threadIdx_x // 7 - -(T.Mul(1, 7) + threadIdx_x // 7) // 9 - -(T.Mul(1, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(1, 7) - -T.Mul(1, 7) + threadIdx_x // 7 - -(T.Mul(1, 7) + threadIdx_x // 7) % 9 - -(T.Mul(1, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(1, 7) + threadIdx_x // 7) % 9 and (T.Mul(1, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x // 
7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(1, 49) - -T.Mul(1, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(1, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(1, 7) + threadIdx_x // 7) % 9 and (T.Mul(1, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(1, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(1, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(1, 7) + threadIdx_x // 7) % 9 and (T.Mul(1, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(1, 7) + threadIdx_x // 7 < 576: - if T.Mul(1, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(1, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(1, 7) + threadIdx_x // 7) % 9 and (T.Mul(1, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - 
-with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(1, 7) + threadIdx_x // 7 < 576: - if T.Mul(1, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(1, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(1, 7) + threadIdx_x // 7) % 9 and (T.Mul(1, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -2 - -T.Mul(2, 7) - -T.Mul(2, 7) + threadIdx_x // 7 - -T.Mul(2, 7) + threadIdx_x // 7 < 576 - -T.Mul(2, 49) - -T.Mul(2, 49) + threadIdx_x - -T.Mul(2, 49) + threadIdx_x < 4032 - -T.Mul(2, 7) - -T.Mul(2, 7) + threadIdx_x // 7 - -(T.Mul(2, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(2, 7) + threadIdx_x // 7) % 9 - -T.Mul(2, 7) - -T.Mul(2, 7) + threadIdx_x // 7 - -(T.Mul(2, 7) + threadIdx_x // 7) % 9 - -(T.Mul(2, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(2, 7) + threadIdx_x // 7) % 9 and (T.Mul(2, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(2, 7) + threadIdx_x // 7) % 9 and (T.Mul(2, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(2, 7) + threadIdx_x // 7) % 9 and (T.Mul(2, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(2, 7) - -T.Mul(2, 7) + threadIdx_x // 7 - -(T.Mul(2, 7) + threadIdx_x // 7) // 9 - -(T.Mul(2, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(2, 7) - -T.Mul(2, 7) + threadIdx_x // 7 - -(T.Mul(2, 7) + threadIdx_x // 7) % 9 - -(T.Mul(2, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 
3136 + (T.Mul(2, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(2, 7) + threadIdx_x // 7) % 9 and (T.Mul(2, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(2, 49) - -T.Mul(2, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(2, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(2, 7) + threadIdx_x // 7) % 9 and (T.Mul(2, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(2, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(2, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(2, 7) + threadIdx_x // 7) % 9 and (T.Mul(2, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, 
data[rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(2, 7) + threadIdx_x // 7 < 576: - if T.Mul(2, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(2, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(2, 7) + threadIdx_x // 7) % 9 and (T.Mul(2, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(2, 7) + threadIdx_x // 7 < 576: - if T.Mul(2, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(2, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(2, 7) + threadIdx_x // 7) % 9 and (T.Mul(2, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -3 - -T.Mul(3, 7) - -T.Mul(3, 7) + threadIdx_x // 7 - -T.Mul(3, 7) + threadIdx_x // 7 < 576 - -T.Mul(3, 49) - -T.Mul(3, 49) + threadIdx_x - -T.Mul(3, 49) + threadIdx_x < 4032 - -T.Mul(3, 7) - -T.Mul(3, 7) + threadIdx_x // 7 - -(T.Mul(3, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(3, 7) + threadIdx_x // 7) % 9 - -T.Mul(3, 7) - -T.Mul(3, 7) + threadIdx_x // 7 - -(T.Mul(3, 7) + threadIdx_x // 7) % 9 - -(T.Mul(3, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(3, 7) + threadIdx_x // 7) 
% 9 and (T.Mul(3, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(3, 7) + threadIdx_x // 7) % 9 and (T.Mul(3, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(3, 7) + threadIdx_x // 7) % 9 and (T.Mul(3, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(3, 7) - -T.Mul(3, 7) + threadIdx_x // 7 - -(T.Mul(3, 7) + threadIdx_x // 7) // 9 - -(T.Mul(3, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(3, 7) - -T.Mul(3, 7) + threadIdx_x // 7 - -(T.Mul(3, 7) + threadIdx_x // 7) % 9 - -(T.Mul(3, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(3, 7) + threadIdx_x // 7) % 9 and (T.Mul(3, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(3, 49) - -T.Mul(3, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(3, 49) + 
threadIdx_x] = T.if_then_else(1 <= (T.Mul(3, 7) + threadIdx_x // 7) % 9 and (T.Mul(3, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(3, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(3, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(3, 7) + threadIdx_x // 7) % 9 and (T.Mul(3, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(3, 7) + threadIdx_x // 7 < 576: - if T.Mul(3, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(3, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(3, 7) + threadIdx_x // 7) % 9 and (T.Mul(3, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(3, 7) + threadIdx_x // 7 < 576: - if T.Mul(3, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(3, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(3, 
7) + threadIdx_x // 7) % 9 and (T.Mul(3, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -4 - -T.Mul(4, 7) - -T.Mul(4, 7) + threadIdx_x // 7 - -T.Mul(4, 7) + threadIdx_x // 7 < 576 - -T.Mul(4, 49) - -T.Mul(4, 49) + threadIdx_x - -T.Mul(4, 49) + threadIdx_x < 4032 - -T.Mul(4, 7) - -T.Mul(4, 7) + threadIdx_x // 7 - -(T.Mul(4, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(4, 7) + threadIdx_x // 7) % 9 - -T.Mul(4, 7) - -T.Mul(4, 7) + threadIdx_x // 7 - -(T.Mul(4, 7) + threadIdx_x // 7) % 9 - -(T.Mul(4, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(4, 7) + threadIdx_x // 7) % 9 and (T.Mul(4, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(4, 7) + threadIdx_x // 7) % 9 and (T.Mul(4, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(4, 7) + threadIdx_x // 7) % 9 and (T.Mul(4, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(4, 7) - -T.Mul(4, 7) + threadIdx_x // 7 - -(T.Mul(4, 7) + threadIdx_x // 7) // 9 - -(T.Mul(4, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(4, 7) - -T.Mul(4, 7) + threadIdx_x // 7 - -(T.Mul(4, 7) + threadIdx_x // 7) % 9 - -(T.Mul(4, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x // 
7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(4, 7) + threadIdx_x // 7) % 9 and (T.Mul(4, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(4, 49) - -T.Mul(4, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(4, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(4, 7) + threadIdx_x // 7) % 9 and (T.Mul(4, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(4, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(4, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(4, 7) + threadIdx_x // 7) % 9 and (T.Mul(4, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(4, 7) + threadIdx_x // 7 < 576: - if T.Mul(4, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = 
T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(4, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(4, 7) + threadIdx_x // 7) % 9 and (T.Mul(4, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(4, 7) + threadIdx_x // 7 < 576: - if T.Mul(4, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(4, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(4, 7) + threadIdx_x // 7) % 9 and (T.Mul(4, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -5 - -T.Mul(5, 7) - -T.Mul(5, 7) + threadIdx_x // 7 - -T.Mul(5, 7) + threadIdx_x // 7 < 576 - -T.Mul(5, 49) - -T.Mul(5, 49) + threadIdx_x - -T.Mul(5, 49) + threadIdx_x < 4032 - -T.Mul(5, 7) - -T.Mul(5, 7) + threadIdx_x // 7 - -(T.Mul(5, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(5, 7) + threadIdx_x // 7) % 9 - -T.Mul(5, 7) - -T.Mul(5, 7) + threadIdx_x // 7 - -(T.Mul(5, 7) + threadIdx_x // 7) % 9 - -(T.Mul(5, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(5, 7) + threadIdx_x // 7) % 9 and (T.Mul(5, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(5, 7) + threadIdx_x // 7) % 9 and (T.Mul(5, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(5, 7) + threadIdx_x // 7) % 9 and (T.Mul(5, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(5, 7) - 
-T.Mul(5, 7) + threadIdx_x // 7 - -(T.Mul(5, 7) + threadIdx_x // 7) // 9 - -(T.Mul(5, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(5, 7) - -T.Mul(5, 7) + threadIdx_x // 7 - -(T.Mul(5, 7) + threadIdx_x // 7) % 9 - -(T.Mul(5, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(5, 7) + threadIdx_x // 7) % 9 and (T.Mul(5, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(5, 49) - -T.Mul(5, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(5, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(5, 7) + threadIdx_x // 7) % 9 and (T.Mul(5, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - 
-threadIdx_x = T.int32() -if T.Mul(5, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(5, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(5, 7) + threadIdx_x // 7) % 9 and (T.Mul(5, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(5, 7) + threadIdx_x // 7 < 576: - if T.Mul(5, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(5, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(5, 7) + threadIdx_x // 7) % 9 and (T.Mul(5, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(5, 7) + threadIdx_x // 7 < 576: - if T.Mul(5, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(5, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(5, 7) + threadIdx_x // 7) % 9 and (T.Mul(5, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -6 - -T.Mul(6, 7) - -T.Mul(6, 7) + 
threadIdx_x // 7 - -T.Mul(6, 7) + threadIdx_x // 7 < 576 - -T.Mul(6, 49) - -T.Mul(6, 49) + threadIdx_x - -T.Mul(6, 49) + threadIdx_x < 4032 - -T.Mul(6, 7) - -T.Mul(6, 7) + threadIdx_x // 7 - -(T.Mul(6, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(6, 7) + threadIdx_x // 7) % 9 - -T.Mul(6, 7) - -T.Mul(6, 7) + threadIdx_x // 7 - -(T.Mul(6, 7) + threadIdx_x // 7) % 9 - -(T.Mul(6, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(6, 7) + threadIdx_x // 7) % 9 and (T.Mul(6, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(6, 7) + threadIdx_x // 7) % 9 and (T.Mul(6, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(6, 7) + threadIdx_x // 7) % 9 and (T.Mul(6, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(6, 7) - -T.Mul(6, 7) + threadIdx_x // 7 - -(T.Mul(6, 7) + threadIdx_x // 7) // 9 - -(T.Mul(6, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(6, 7) - -T.Mul(6, 7) + threadIdx_x // 7 - -(T.Mul(6, 7) + threadIdx_x // 7) % 9 - -(T.Mul(6, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(6, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(6, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(6, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(6, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(6, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(6, 7) + threadIdx_x // 7) % 9 and (T.Mul(6, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and 
rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(6, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(6, 49) - -T.Mul(6, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(6, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(6, 7) + threadIdx_x // 7) % 9 and (T.Mul(6, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(6, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(6, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(6, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(6, 7) + threadIdx_x // 7) % 9 and (T.Mul(6, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(6, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(6, 7) + threadIdx_x // 7 < 576: - if T.Mul(6, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(6, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(6, 7) + threadIdx_x // 7) % 9 and (T.Mul(6, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(6, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(6, 7) + threadIdx_x // 7 < 576: - if T.Mul(6, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(6, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(6, 7) + threadIdx_x // 7) % 9 and (T.Mul(6, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(6, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -7 - -T.Mul(7, 7) - -T.Mul(7, 7) + threadIdx_x // 7 - -T.Mul(7, 7) + threadIdx_x // 7 < 576 - -T.Mul(7, 49) - -T.Mul(7, 49) + threadIdx_x - -T.Mul(7, 49) + threadIdx_x < 4032 - -T.Mul(7, 7) - -T.Mul(7, 7) + threadIdx_x // 7 - -(T.Mul(7, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(7, 7) + threadIdx_x // 7) % 9 - -T.Mul(7, 7) - -T.Mul(7, 7) + threadIdx_x // 7 - -(T.Mul(7, 7) + threadIdx_x // 7) % 9 - -(T.Mul(7, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(7, 7) + threadIdx_x // 7) % 9 and (T.Mul(7, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(7, 7) + threadIdx_x // 7) % 9 and (T.Mul(7, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(7, 7) + threadIdx_x // 7) % 9 and (T.Mul(7, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(7, 7) - -T.Mul(7, 7) + threadIdx_x // 7 - -(T.Mul(7, 7) + threadIdx_x // 7) // 9 - -(T.Mul(7, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(7, 7) - -T.Mul(7, 7) + threadIdx_x // 7 - -(T.Mul(7, 7) + threadIdx_x // 7) % 9 - -(T.Mul(7, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + 
(T.Mul(7, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(7, 7) + threadIdx_x // 7) % 9 and (T.Mul(7, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(7, 49) - -T.Mul(7, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(7, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(7, 7) + threadIdx_x // 7) % 9 and (T.Mul(7, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(7, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(7, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(7, 7) + threadIdx_x // 7) % 9 and (T.Mul(7, 7) + threadIdx_x // 7) % 
9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(7, 7) + threadIdx_x // 7 < 576: - if T.Mul(7, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(7, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(7, 7) + threadIdx_x // 7) % 9 and (T.Mul(7, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(7, 7) + threadIdx_x // 7 < 576: - if T.Mul(7, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(7, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(7, 7) + threadIdx_x // 7) % 9 and (T.Mul(7, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -8 - -T.Mul(8, 7) - -T.Mul(8, 7) + threadIdx_x // 7 - -T.Mul(8, 7) + threadIdx_x // 7 < 576 - -T.Mul(8, 49) - -T.Mul(8, 49) + threadIdx_x - -T.Mul(8, 49) + threadIdx_x < 4032 - -T.Mul(8, 7) - -T.Mul(8, 7) + threadIdx_x // 7 - -(T.Mul(8, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(8, 7) + threadIdx_x // 7) % 9 - -T.Mul(8, 7) - -T.Mul(8, 7) + threadIdx_x // 7 - -(T.Mul(8, 7) + threadIdx_x // 
7) % 9 - -(T.Mul(8, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(8, 7) + threadIdx_x // 7) % 9 and (T.Mul(8, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(8, 7) + threadIdx_x // 7) % 9 and (T.Mul(8, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(8, 7) + threadIdx_x // 7) % 9 and (T.Mul(8, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(8, 7) - -T.Mul(8, 7) + threadIdx_x // 7 - -(T.Mul(8, 7) + threadIdx_x // 7) // 9 - -(T.Mul(8, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(8, 7) - -T.Mul(8, 7) + threadIdx_x // 7 - -(T.Mul(8, 7) + threadIdx_x // 7) % 9 - -(T.Mul(8, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(8, 7) + threadIdx_x // 7) % 9 and (T.Mul(8, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(8, 49) - -T.Mul(8, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = 
T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(8, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(8, 7) + threadIdx_x // 7) % 9 and (T.Mul(8, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(8, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(8, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(8, 7) + threadIdx_x // 7) % 9 and (T.Mul(8, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(8, 7) + threadIdx_x // 7 < 576: - if T.Mul(8, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(8, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(8, 7) + threadIdx_x // 7) % 9 and (T.Mul(8, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(8, 7) + threadIdx_x // 7 < 576: - if T.Mul(8, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - 
rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(8, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(8, 7) + threadIdx_x // 7) % 9 and (T.Mul(8, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -9 - -T.Mul(9, 7) - -T.Mul(9, 7) + threadIdx_x // 7 - -T.Mul(9, 7) + threadIdx_x // 7 < 576 - -T.Mul(9, 49) - -T.Mul(9, 49) + threadIdx_x - -T.Mul(9, 49) + threadIdx_x < 4032 - -T.Mul(9, 7) - -T.Mul(9, 7) + threadIdx_x // 7 - -(T.Mul(9, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(9, 7) + threadIdx_x // 7) % 9 - -T.Mul(9, 7) - -T.Mul(9, 7) + threadIdx_x // 7 - -(T.Mul(9, 7) + threadIdx_x // 7) % 9 - -(T.Mul(9, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(9, 7) + threadIdx_x // 7) % 9 and (T.Mul(9, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(9, 7) + threadIdx_x // 7) % 9 and (T.Mul(9, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(9, 7) + threadIdx_x // 7) % 9 and (T.Mul(9, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(9, 7) - -T.Mul(9, 7) + threadIdx_x // 7 - -(T.Mul(9, 7) + threadIdx_x // 7) // 9 - -(T.Mul(9, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(9, 7) - -T.Mul(9, 7) + threadIdx_x // 7 - -(T.Mul(9, 7) + threadIdx_x // 7) % 9 - -(T.Mul(9, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 
7 - -rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(9, 7) + threadIdx_x // 7) % 9 and (T.Mul(9, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(9, 49) - -T.Mul(9, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(9, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(9, 7) + threadIdx_x // 7) % 9 and (T.Mul(9, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(9, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(9, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(9, 7) + threadIdx_x // 7) % 9 and (T.Mul(9, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(9, 7) + threadIdx_x // 7 < 576: - if T.Mul(9, 49) + threadIdx_x 
< 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(9, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(9, 7) + threadIdx_x // 7) % 9 and (T.Mul(9, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(9, 7) + threadIdx_x // 7 < 576: - if T.Mul(9, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(9, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(9, 7) + threadIdx_x // 7) % 9 and (T.Mul(9, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -10 - -T.Mul(10, 7) - -T.Mul(10, 7) + threadIdx_x // 7 - -T.Mul(10, 7) + threadIdx_x // 7 < 576 - -T.Mul(10, 49) - -T.Mul(10, 49) + threadIdx_x - -T.Mul(10, 49) + threadIdx_x < 4032 - -T.Mul(10, 7) - -T.Mul(10, 7) + threadIdx_x // 7 - -(T.Mul(10, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(10, 7) + threadIdx_x // 7) % 9 - -T.Mul(10, 7) - -T.Mul(10, 7) + threadIdx_x // 7 - -(T.Mul(10, 7) + threadIdx_x // 7) % 9 - -(T.Mul(10, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(10, 7) + threadIdx_x // 7) % 9 and (T.Mul(10, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(10, 7) + threadIdx_x // 7) % 9 and (T.Mul(10, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(10, 7) + threadIdx_x // 7) % 9 and (T.Mul(10, 7) + 
threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(10, 7) - -T.Mul(10, 7) + threadIdx_x // 7 - -(T.Mul(10, 7) + threadIdx_x // 7) // 9 - -(T.Mul(10, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(10, 7) - -T.Mul(10, 7) + threadIdx_x // 7 - -(T.Mul(10, 7) + threadIdx_x // 7) % 9 - -(T.Mul(10, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(10, 7) + threadIdx_x // 7) % 9 and (T.Mul(10, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(10, 49) - -T.Mul(10, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(10, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(10, 7) + threadIdx_x // 7) % 9 and (T.Mul(10, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer 
* 3136 + (T.Mul(10, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(10, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(10, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(10, 7) + threadIdx_x // 7) % 9 and (T.Mul(10, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(10, 7) + threadIdx_x // 7 < 576: - if T.Mul(10, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(10, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(10, 7) + threadIdx_x // 7) % 9 and (T.Mul(10, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(10, 7) + threadIdx_x // 7 < 576: - if T.Mul(10, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(10, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(10, 7) + threadIdx_x // 7) % 9 and (T.Mul(10, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(10, 7) + 
threadIdx_x // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -11 - -T.Mul(11, 7) - -T.Mul(11, 7) + threadIdx_x // 7 - -T.Mul(11, 7) + threadIdx_x // 7 < 576 - -T.Mul(11, 49) - -T.Mul(11, 49) + threadIdx_x - -T.Mul(11, 49) + threadIdx_x < 4032 - -T.Mul(11, 7) - -T.Mul(11, 7) + threadIdx_x // 7 - -(T.Mul(11, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(11, 7) + threadIdx_x // 7) % 9 - -T.Mul(11, 7) - -T.Mul(11, 7) + threadIdx_x // 7 - -(T.Mul(11, 7) + threadIdx_x // 7) % 9 - -(T.Mul(11, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(11, 7) + threadIdx_x // 7) % 9 and (T.Mul(11, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(11, 7) + threadIdx_x // 7) % 9 and (T.Mul(11, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(11, 7) + threadIdx_x // 7) % 9 and (T.Mul(11, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(11, 7) - -T.Mul(11, 7) + threadIdx_x // 7 - -(T.Mul(11, 7) + threadIdx_x // 7) // 9 - -(T.Mul(11, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(11, 7) - -T.Mul(11, 7) + threadIdx_x // 7 - -(T.Mul(11, 7) + threadIdx_x // 7) % 9 - -(T.Mul(11, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x // 7) 
% 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(11, 7) + threadIdx_x // 7) % 9 and (T.Mul(11, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(11, 49) - -T.Mul(11, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(11, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(11, 7) + threadIdx_x // 7) % 9 and (T.Mul(11, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(11, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(11, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(11, 7) + threadIdx_x // 7) % 9 and (T.Mul(11, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(11, 7) + threadIdx_x // 7 < 576: - if T.Mul(11, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(11, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(11, 7) + threadIdx_x 
// 7) % 9 and (T.Mul(11, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(11, 7) + threadIdx_x // 7 < 576: - if T.Mul(11, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(11, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(11, 7) + threadIdx_x // 7) % 9 and (T.Mul(11, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -12 - -T.Mul(12, 7) - -T.Mul(12, 7) + threadIdx_x // 7 - -T.Mul(12, 7) + threadIdx_x // 7 < 576 - -T.Mul(12, 49) - -T.Mul(12, 49) + threadIdx_x - -T.Mul(12, 49) + threadIdx_x < 4032 - -T.Mul(12, 7) - -T.Mul(12, 7) + threadIdx_x // 7 - -(T.Mul(12, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(12, 7) + threadIdx_x // 7) % 9 - -T.Mul(12, 7) - -T.Mul(12, 7) + threadIdx_x // 7 - -(T.Mul(12, 7) + threadIdx_x // 7) % 9 - -(T.Mul(12, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(12, 7) + threadIdx_x // 7) % 9 and (T.Mul(12, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(12, 7) + threadIdx_x // 7) % 9 and (T.Mul(12, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(12, 7) + threadIdx_x // 7) % 9 and (T.Mul(12, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(12, 7) - -T.Mul(12, 7) + threadIdx_x // 7 - -(T.Mul(12, 7) + threadIdx_x // 7) // 9 - -(T.Mul(12, 7) + threadIdx_x // 7) 
// 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(12, 7) - -T.Mul(12, 7) + threadIdx_x // 7 - -(T.Mul(12, 7) + threadIdx_x // 7) % 9 - -(T.Mul(12, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(12, 7) + threadIdx_x // 7) % 9 and (T.Mul(12, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(12, 49) - -T.Mul(12, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(12, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(12, 7) + threadIdx_x // 7) % 9 and (T.Mul(12, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(12, 49) + threadIdx_x < 4032: - pad_temp_shared 
= T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(12, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(12, 7) + threadIdx_x // 7) % 9 and (T.Mul(12, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(12, 7) + threadIdx_x // 7 < 576: - if T.Mul(12, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(12, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(12, 7) + threadIdx_x // 7) % 9 and (T.Mul(12, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(12, 7) + threadIdx_x // 7 < 576: - if T.Mul(12, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(12, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(12, 7) + threadIdx_x // 7) % 9 and (T.Mul(12, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -13 - -T.Mul(13, 7) - -T.Mul(13, 7) + threadIdx_x // 7 - -T.Mul(13, 7) + threadIdx_x // 7 < 576 - 
-T.Mul(13, 49) - -T.Mul(13, 49) + threadIdx_x - -T.Mul(13, 49) + threadIdx_x < 4032 - -T.Mul(13, 7) - -T.Mul(13, 7) + threadIdx_x // 7 - -(T.Mul(13, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(13, 7) + threadIdx_x // 7) % 9 - -T.Mul(13, 7) - -T.Mul(13, 7) + threadIdx_x // 7 - -(T.Mul(13, 7) + threadIdx_x // 7) % 9 - -(T.Mul(13, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(13, 7) + threadIdx_x // 7) % 9 and (T.Mul(13, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(13, 7) + threadIdx_x // 7) % 9 and (T.Mul(13, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(13, 7) + threadIdx_x // 7) % 9 and (T.Mul(13, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(13, 7) - -T.Mul(13, 7) + threadIdx_x // 7 - -(T.Mul(13, 7) + threadIdx_x // 7) // 9 - -(T.Mul(13, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(13, 7) - -T.Mul(13, 7) + threadIdx_x // 7 - -(T.Mul(13, 7) + threadIdx_x // 7) % 9 - -(T.Mul(13, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(13, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(13, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(13, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(13, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(13, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(13, 7) + threadIdx_x // 7) % 9 and (T.Mul(13, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + 
threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(13, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(13, 49) - -T.Mul(13, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(13, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(13, 7) + threadIdx_x // 7) % 9 and (T.Mul(13, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(13, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(13, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(13, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(13, 7) + threadIdx_x // 7) % 9 and (T.Mul(13, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(13, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(13, 7) + threadIdx_x // 7 < 576: - if T.Mul(13, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(13, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(13, 7) + threadIdx_x // 7) % 9 and (T.Mul(13, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(13, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(13, 7) + threadIdx_x // 7 < 576: - if T.Mul(13, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(13, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(13, 7) + threadIdx_x // 7) % 9 and (T.Mul(13, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(13, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -14 - -T.Mul(14, 7) - -T.Mul(14, 7) + threadIdx_x // 7 - -T.Mul(14, 7) + threadIdx_x // 7 < 576 - -T.Mul(14, 49) - -T.Mul(14, 49) + threadIdx_x - -T.Mul(14, 49) + threadIdx_x < 4032 - -T.Mul(14, 7) - -T.Mul(14, 7) + threadIdx_x // 7 - -(T.Mul(14, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(14, 7) + threadIdx_x // 7) % 9 - -T.Mul(14, 7) - -T.Mul(14, 7) + threadIdx_x // 7 - -(T.Mul(14, 7) + threadIdx_x // 7) % 9 - -(T.Mul(14, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(14, 7) + threadIdx_x // 7) % 9 and (T.Mul(14, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(14, 7) + threadIdx_x // 7) % 9 and (T.Mul(14, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(14, 7) + threadIdx_x // 7) % 9 and (T.Mul(14, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(14, 7) - -T.Mul(14, 7) + threadIdx_x // 7 - -(T.Mul(14, 7) + threadIdx_x // 7) // 9 - -(T.Mul(14, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(14, 7) - -T.Mul(14, 7) + threadIdx_x // 7 - -(T.Mul(14, 7) + threadIdx_x // 7) % 9 - -(T.Mul(14, 7) + threadIdx_x // 7) % 9 * 
7 - -rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(14, 7) + threadIdx_x // 7) % 9 and (T.Mul(14, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(14, 49) - -T.Mul(14, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(14, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(14, 7) + threadIdx_x // 7) % 9 and (T.Mul(14, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(14, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(14, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(14, 7) + 
threadIdx_x // 7) % 9 and (T.Mul(14, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(14, 7) + threadIdx_x // 7 < 576: - if T.Mul(14, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(14, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(14, 7) + threadIdx_x // 7) % 9 and (T.Mul(14, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(14, 7) + threadIdx_x // 7 < 576: - if T.Mul(14, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(14, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(14, 7) + threadIdx_x // 7) % 9 and (T.Mul(14, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -15 - -T.Mul(15, 7) - -T.Mul(15, 7) + threadIdx_x // 7 - -T.Mul(15, 7) + threadIdx_x // 7 < 576 - -T.Mul(15, 49) - -T.Mul(15, 49) + threadIdx_x - -T.Mul(15, 49) + threadIdx_x < 4032 - -T.Mul(15, 7) - -T.Mul(15, 7) + threadIdx_x // 7 - -(T.Mul(15, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(15, 7) + threadIdx_x // 
7) % 9 - -T.Mul(15, 7) - -T.Mul(15, 7) + threadIdx_x // 7 - -(T.Mul(15, 7) + threadIdx_x // 7) % 9 - -(T.Mul(15, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(15, 7) + threadIdx_x // 7) % 9 and (T.Mul(15, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(15, 7) + threadIdx_x // 7) % 9 and (T.Mul(15, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(15, 7) + threadIdx_x // 7) % 9 and (T.Mul(15, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(15, 7) - -T.Mul(15, 7) + threadIdx_x // 7 - -(T.Mul(15, 7) + threadIdx_x // 7) // 9 - -(T.Mul(15, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(15, 7) - -T.Mul(15, 7) + threadIdx_x // 7 - -(T.Mul(15, 7) + threadIdx_x // 7) % 9 - -(T.Mul(15, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(15, 7) + threadIdx_x // 7) % 9 and (T.Mul(15, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(15, 49) - -T.Mul(15, 
49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(15, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(15, 7) + threadIdx_x // 7) % 9 and (T.Mul(15, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(15, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(15, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(15, 7) + threadIdx_x // 7) % 9 and (T.Mul(15, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(15, 7) + threadIdx_x // 7 < 576: - if T.Mul(15, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(15, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(15, 7) + threadIdx_x // 7) % 9 and (T.Mul(15, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(15, 7) + threadIdx_x // 7 < 576: - if T.Mul(15, 49) + 
threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(15, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(15, 7) + threadIdx_x // 7) % 9 and (T.Mul(15, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -16 - -T.Mul(16, 7) - -T.Mul(16, 7) + threadIdx_x // 7 - -T.Mul(16, 7) + threadIdx_x // 7 < 576 - -T.Mul(16, 49) - -T.Mul(16, 49) + threadIdx_x - -T.Mul(16, 49) + threadIdx_x < 4032 - -T.Mul(16, 7) - -T.Mul(16, 7) + threadIdx_x // 7 - -(T.Mul(16, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(16, 7) + threadIdx_x // 7) % 9 - -T.Mul(16, 7) - -T.Mul(16, 7) + threadIdx_x // 7 - -(T.Mul(16, 7) + threadIdx_x // 7) % 9 - -(T.Mul(16, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(16, 7) + threadIdx_x // 7) % 9 and (T.Mul(16, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(16, 7) + threadIdx_x // 7) % 9 and (T.Mul(16, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(16, 7) + threadIdx_x // 7) % 9 and (T.Mul(16, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(16, 7) - -T.Mul(16, 7) + threadIdx_x // 7 - -(T.Mul(16, 7) + threadIdx_x // 7) // 9 - -(T.Mul(16, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(16, 7) - -T.Mul(16, 7) + threadIdx_x // 7 - -(T.Mul(16, 7) + threadIdx_x // 7) % 9 - -(T.Mul(16, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(16, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(16, 7) + threadIdx_x // 
7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(16, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(16, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(16, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(16, 7) + threadIdx_x // 7) % 9 and (T.Mul(16, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(16, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(16, 49) - -T.Mul(16, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(16, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(16, 7) + threadIdx_x // 7) % 9 and (T.Mul(16, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(16, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(16, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(16, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(16, 7) + threadIdx_x // 7) % 9 and (T.Mul(16, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(16, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(16, 7) + threadIdx_x // 7 < 576: - if T.Mul(16, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(16, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(16, 7) + threadIdx_x // 7) % 9 and (T.Mul(16, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(16, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(16, 7) + threadIdx_x // 7 < 576: - if T.Mul(16, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(16, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(16, 7) + threadIdx_x // 7) % 9 and (T.Mul(16, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(16, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -17 - -T.Mul(17, 7) - -T.Mul(17, 7) + threadIdx_x // 7 - -T.Mul(17, 7) + threadIdx_x // 7 < 576 - -T.Mul(17, 49) - -T.Mul(17, 49) + threadIdx_x - -T.Mul(17, 49) + threadIdx_x < 4032 - -T.Mul(17, 7) - -T.Mul(17, 7) + threadIdx_x // 7 - -(T.Mul(17, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(17, 7) + threadIdx_x // 7) % 9 - -T.Mul(17, 7) - -T.Mul(17, 7) + threadIdx_x // 7 - -(T.Mul(17, 7) + threadIdx_x // 7) % 9 - -(T.Mul(17, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(17, 7) + threadIdx_x // 7) % 9 and (T.Mul(17, 7) + threadIdx_x // 7) 
% 9 < 8 - -1 <= (T.Mul(17, 7) + threadIdx_x // 7) % 9 and (T.Mul(17, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(17, 7) + threadIdx_x // 7) % 9 and (T.Mul(17, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(17, 7) - -T.Mul(17, 7) + threadIdx_x // 7 - -(T.Mul(17, 7) + threadIdx_x // 7) // 9 - -(T.Mul(17, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(17, 7) - -T.Mul(17, 7) + threadIdx_x // 7 - -(T.Mul(17, 7) + threadIdx_x // 7) % 9 - -(T.Mul(17, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(17, 7) + threadIdx_x // 7) % 9 and (T.Mul(17, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(17, 49) - -T.Mul(17, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(17, 49) + threadIdx_x] = 
T.if_then_else(1 <= (T.Mul(17, 7) + threadIdx_x // 7) % 9 and (T.Mul(17, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(17, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(17, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(17, 7) + threadIdx_x // 7) % 9 and (T.Mul(17, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(17, 7) + threadIdx_x // 7 < 576: - if T.Mul(17, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(17, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(17, 7) + threadIdx_x // 7) % 9 and (T.Mul(17, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(17, 7) + threadIdx_x // 7 < 576: - if T.Mul(17, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(17, 49) + threadIdx_x] = T.if_then_else(1 <= 
(T.Mul(17, 7) + threadIdx_x // 7) % 9 and (T.Mul(17, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -18 - -T.Mul(18, 7) - -T.Mul(18, 7) + threadIdx_x // 7 - -T.Mul(18, 7) + threadIdx_x // 7 < 576 - -T.Mul(18, 49) - -T.Mul(18, 49) + threadIdx_x - -T.Mul(18, 49) + threadIdx_x < 4032 - -T.Mul(18, 7) - -T.Mul(18, 7) + threadIdx_x // 7 - -(T.Mul(18, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(18, 7) + threadIdx_x // 7) % 9 - -T.Mul(18, 7) - -T.Mul(18, 7) + threadIdx_x // 7 - -(T.Mul(18, 7) + threadIdx_x // 7) % 9 - -(T.Mul(18, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(18, 7) + threadIdx_x // 7) % 9 and (T.Mul(18, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(18, 7) + threadIdx_x // 7) % 9 and (T.Mul(18, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(18, 7) + threadIdx_x // 7) % 9 and (T.Mul(18, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(18, 7) - -T.Mul(18, 7) + threadIdx_x // 7 - -(T.Mul(18, 7) + threadIdx_x // 7) // 9 - -(T.Mul(18, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(18, 7) - -T.Mul(18, 7) + threadIdx_x // 7 - -(T.Mul(18, 7) + threadIdx_x // 7) % 9 - -(T.Mul(18, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(18, 7) + 
threadIdx_x // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(18, 7) + threadIdx_x // 7) % 9 and (T.Mul(18, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(18, 49) - -T.Mul(18, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(18, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(18, 7) + threadIdx_x // 7) % 9 and (T.Mul(18, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(18, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(18, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(18, 7) + threadIdx_x // 7) % 9 and (T.Mul(18, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(18, 7) + threadIdx_x // 7 < 576: - if T.Mul(18, 49) + threadIdx_x < 4032: - 
pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(18, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(18, 7) + threadIdx_x // 7) % 9 and (T.Mul(18, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(18, 7) + threadIdx_x // 7 < 576: - if T.Mul(18, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(18, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(18, 7) + threadIdx_x // 7) % 9 and (T.Mul(18, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -19 - -T.Mul(19, 7) - -T.Mul(19, 7) + threadIdx_x // 7 - -T.Mul(19, 7) + threadIdx_x // 7 < 576 - -T.Mul(19, 49) - -T.Mul(19, 49) + threadIdx_x - -T.Mul(19, 49) + threadIdx_x < 4032 - -T.Mul(19, 7) - -T.Mul(19, 7) + threadIdx_x // 7 - -(T.Mul(19, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(19, 7) + threadIdx_x // 7) % 9 - -T.Mul(19, 7) - -T.Mul(19, 7) + threadIdx_x // 7 - -(T.Mul(19, 7) + threadIdx_x // 7) % 9 - -(T.Mul(19, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(19, 7) + threadIdx_x // 7) % 9 and (T.Mul(19, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(19, 7) + threadIdx_x // 7) % 9 and (T.Mul(19, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(19, 7) + threadIdx_x // 7) % 9 and (T.Mul(19, 7) + 
threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(19, 7) - -T.Mul(19, 7) + threadIdx_x // 7 - -(T.Mul(19, 7) + threadIdx_x // 7) // 9 - -(T.Mul(19, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(19, 7) - -T.Mul(19, 7) + threadIdx_x // 7 - -(T.Mul(19, 7) + threadIdx_x // 7) % 9 - -(T.Mul(19, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(19, 7) + threadIdx_x // 7) % 9 and (T.Mul(19, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(19, 49) - -T.Mul(19, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(19, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(19, 7) + threadIdx_x // 7) % 9 and (T.Mul(19, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer 
* 3136 + (T.Mul(19, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(19, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(19, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(19, 7) + threadIdx_x // 7) % 9 and (T.Mul(19, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(19, 7) + threadIdx_x // 7 < 576: - if T.Mul(19, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(19, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(19, 7) + threadIdx_x // 7) % 9 and (T.Mul(19, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(19, 7) + threadIdx_x // 7 < 576: - if T.Mul(19, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(19, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(19, 7) + threadIdx_x // 7) % 9 and (T.Mul(19, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(19, 7) + 
threadIdx_x // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -20 - -T.Mul(20, 7) - -T.Mul(20, 7) + threadIdx_x // 7 - -T.Mul(20, 7) + threadIdx_x // 7 < 576 - -T.Mul(20, 49) - -T.Mul(20, 49) + threadIdx_x - -T.Mul(20, 49) + threadIdx_x < 4032 - -T.Mul(20, 7) - -T.Mul(20, 7) + threadIdx_x // 7 - -(T.Mul(20, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(20, 7) + threadIdx_x // 7) % 9 - -T.Mul(20, 7) - -T.Mul(20, 7) + threadIdx_x // 7 - -(T.Mul(20, 7) + threadIdx_x // 7) % 9 - -(T.Mul(20, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(20, 7) + threadIdx_x // 7) % 9 and (T.Mul(20, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(20, 7) + threadIdx_x // 7) % 9 and (T.Mul(20, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(20, 7) + threadIdx_x // 7) % 9 and (T.Mul(20, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(20, 7) - -T.Mul(20, 7) + threadIdx_x // 7 - -(T.Mul(20, 7) + threadIdx_x // 7) // 9 - -(T.Mul(20, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(20, 7) - -T.Mul(20, 7) + threadIdx_x // 7 - -(T.Mul(20, 7) + threadIdx_x // 7) % 9 - -(T.Mul(20, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x // 7) 
% 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(20, 7) + threadIdx_x // 7) % 9 and (T.Mul(20, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(20, 49) - -T.Mul(20, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(20, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(20, 7) + threadIdx_x // 7) % 9 and (T.Mul(20, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(20, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(20, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(20, 7) + threadIdx_x // 7) % 9 and (T.Mul(20, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(20, 7) + threadIdx_x // 7 < 576: - if T.Mul(20, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(20, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(20, 7) + threadIdx_x 
// 7) % 9 and (T.Mul(20, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(20, 7) + threadIdx_x // 7 < 576: - if T.Mul(20, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(20, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(20, 7) + threadIdx_x // 7) % 9 and (T.Mul(20, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -21 - -T.Mul(21, 7) - -T.Mul(21, 7) + threadIdx_x // 7 - -T.Mul(21, 7) + threadIdx_x // 7 < 576 - -T.Mul(21, 49) - -T.Mul(21, 49) + threadIdx_x - -T.Mul(21, 49) + threadIdx_x < 4032 - -T.Mul(21, 7) - -T.Mul(21, 7) + threadIdx_x // 7 - -(T.Mul(21, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(21, 7) + threadIdx_x // 7) % 9 - -T.Mul(21, 7) - -T.Mul(21, 7) + threadIdx_x // 7 - -(T.Mul(21, 7) + threadIdx_x // 7) % 9 - -(T.Mul(21, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(21, 7) + threadIdx_x // 7) % 9 and (T.Mul(21, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(21, 7) + threadIdx_x // 7) % 9 and (T.Mul(21, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(21, 7) + threadIdx_x // 7) % 9 and (T.Mul(21, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(21, 7) - -T.Mul(21, 7) + threadIdx_x // 7 - -(T.Mul(21, 7) + threadIdx_x // 7) // 9 - -(T.Mul(21, 7) + threadIdx_x // 7) 
// 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(21, 7) - -T.Mul(21, 7) + threadIdx_x // 7 - -(T.Mul(21, 7) + threadIdx_x // 7) % 9 - -(T.Mul(21, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(21, 7) + threadIdx_x // 7) % 9 and (T.Mul(21, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(21, 49) - -T.Mul(21, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(21, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(21, 7) + threadIdx_x // 7) % 9 and (T.Mul(21, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(21, 49) + threadIdx_x < 4032: - pad_temp_shared 
= T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(21, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(21, 7) + threadIdx_x // 7) % 9 and (T.Mul(21, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(21, 7) + threadIdx_x // 7 < 576: - if T.Mul(21, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(21, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(21, 7) + threadIdx_x // 7) % 9 and (T.Mul(21, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(21, 7) + threadIdx_x // 7 < 576: - if T.Mul(21, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(21, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(21, 7) + threadIdx_x // 7) % 9 and (T.Mul(21, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -22 - -T.Mul(22, 7) - -T.Mul(22, 7) + threadIdx_x // 7 - -T.Mul(22, 7) + threadIdx_x // 7 < 576 - 
-T.Mul(22, 49) - -T.Mul(22, 49) + threadIdx_x - -T.Mul(22, 49) + threadIdx_x < 4032 - -T.Mul(22, 7) - -T.Mul(22, 7) + threadIdx_x // 7 - -(T.Mul(22, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(22, 7) + threadIdx_x // 7) % 9 - -T.Mul(22, 7) - -T.Mul(22, 7) + threadIdx_x // 7 - -(T.Mul(22, 7) + threadIdx_x // 7) % 9 - -(T.Mul(22, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(22, 7) + threadIdx_x // 7) % 9 and (T.Mul(22, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(22, 7) + threadIdx_x // 7) % 9 and (T.Mul(22, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(22, 7) + threadIdx_x // 7) % 9 and (T.Mul(22, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(22, 7) - -T.Mul(22, 7) + threadIdx_x // 7 - -(T.Mul(22, 7) + threadIdx_x // 7) // 9 - -(T.Mul(22, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(22, 7) - -T.Mul(22, 7) + threadIdx_x // 7 - -(T.Mul(22, 7) + threadIdx_x // 7) % 9 - -(T.Mul(22, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(22, 7) + threadIdx_x // 7) % 9 and (T.Mul(22, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + 
threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(22, 49) - -T.Mul(22, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(22, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(22, 7) + threadIdx_x // 7) % 9 and (T.Mul(22, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(22, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(22, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(22, 7) + threadIdx_x // 7) % 9 and (T.Mul(22, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(22, 7) + threadIdx_x // 7 < 576: - if T.Mul(22, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(22, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(22, 7) + threadIdx_x // 7) % 9 and (T.Mul(22, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(22, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(22, 7) + threadIdx_x // 7 < 576: - if T.Mul(22, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(22, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(22, 7) + threadIdx_x // 7) % 9 and (T.Mul(22, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -23 - -T.Mul(23, 7) - -T.Mul(23, 7) + threadIdx_x // 7 - -T.Mul(23, 7) + threadIdx_x // 7 < 576 - -T.Mul(23, 49) - -T.Mul(23, 49) + threadIdx_x - -T.Mul(23, 49) + threadIdx_x < 4032 - -T.Mul(23, 7) - -T.Mul(23, 7) + threadIdx_x // 7 - -(T.Mul(23, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(23, 7) + threadIdx_x // 7) % 9 - -T.Mul(23, 7) - -T.Mul(23, 7) + threadIdx_x // 7 - -(T.Mul(23, 7) + threadIdx_x // 7) % 9 - -(T.Mul(23, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(23, 7) + threadIdx_x // 7) % 9 and (T.Mul(23, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(23, 7) + threadIdx_x // 7) % 9 and (T.Mul(23, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(23, 7) + threadIdx_x // 7) % 9 and (T.Mul(23, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(23, 7) - -T.Mul(23, 7) + threadIdx_x // 7 - -(T.Mul(23, 7) + threadIdx_x // 7) // 9 - -(T.Mul(23, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(23, 7) - -T.Mul(23, 7) + threadIdx_x // 7 - -(T.Mul(23, 7) + threadIdx_x // 7) % 9 - -(T.Mul(23, 7) + threadIdx_x // 7) % 9 * 
7 - -rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(23, 7) + threadIdx_x // 7) % 9 and (T.Mul(23, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(23, 49) - -T.Mul(23, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(23, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(23, 7) + threadIdx_x // 7) % 9 and (T.Mul(23, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(23, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(23, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(23, 7) + 
threadIdx_x // 7) % 9 and (T.Mul(23, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(23, 7) + threadIdx_x // 7 < 576: - if T.Mul(23, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(23, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(23, 7) + threadIdx_x // 7) % 9 and (T.Mul(23, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(23, 7) + threadIdx_x // 7 < 576: - if T.Mul(23, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(23, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(23, 7) + threadIdx_x // 7) % 9 and (T.Mul(23, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -24 - -T.Mul(24, 7) - -T.Mul(24, 7) + threadIdx_x // 7 - -T.Mul(24, 7) + threadIdx_x // 7 < 576 - -T.Mul(24, 49) - -T.Mul(24, 49) + threadIdx_x - -T.Mul(24, 49) + threadIdx_x < 4032 - -T.Mul(24, 7) - -T.Mul(24, 7) + threadIdx_x // 7 - -(T.Mul(24, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(24, 7) + threadIdx_x // 
7) % 9 - -T.Mul(24, 7) - -T.Mul(24, 7) + threadIdx_x // 7 - -(T.Mul(24, 7) + threadIdx_x // 7) % 9 - -(T.Mul(24, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(24, 7) + threadIdx_x // 7) % 9 and (T.Mul(24, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(24, 7) + threadIdx_x // 7) % 9 and (T.Mul(24, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(24, 7) + threadIdx_x // 7) % 9 and (T.Mul(24, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(24, 7) - -T.Mul(24, 7) + threadIdx_x // 7 - -(T.Mul(24, 7) + threadIdx_x // 7) // 9 - -(T.Mul(24, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(24, 7) - -T.Mul(24, 7) + threadIdx_x // 7 - -(T.Mul(24, 7) + threadIdx_x // 7) % 9 - -(T.Mul(24, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(24, 7) + threadIdx_x // 7) % 9 and (T.Mul(24, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(24, 49) - -T.Mul(24, 
49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(24, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(24, 7) + threadIdx_x // 7) % 9 and (T.Mul(24, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(24, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(24, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(24, 7) + threadIdx_x // 7) % 9 and (T.Mul(24, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(24, 7) + threadIdx_x // 7 < 576: - if T.Mul(24, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(24, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(24, 7) + threadIdx_x // 7) % 9 and (T.Mul(24, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(24, 7) + threadIdx_x // 7 < 576: - if T.Mul(24, 49) + 
threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(24, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(24, 7) + threadIdx_x // 7) % 9 and (T.Mul(24, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -25 - -T.Mul(25, 7) - -T.Mul(25, 7) + threadIdx_x // 7 - -T.Mul(25, 7) + threadIdx_x // 7 < 576 - -T.Mul(25, 49) - -T.Mul(25, 49) + threadIdx_x - -T.Mul(25, 49) + threadIdx_x < 4032 - -T.Mul(25, 7) - -T.Mul(25, 7) + threadIdx_x // 7 - -(T.Mul(25, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(25, 7) + threadIdx_x // 7) % 9 - -T.Mul(25, 7) - -T.Mul(25, 7) + threadIdx_x // 7 - -(T.Mul(25, 7) + threadIdx_x // 7) % 9 - -(T.Mul(25, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(25, 7) + threadIdx_x // 7) % 9 and (T.Mul(25, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(25, 7) + threadIdx_x // 7) % 9 and (T.Mul(25, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(25, 7) + threadIdx_x // 7) % 9 and (T.Mul(25, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(25, 7) - -T.Mul(25, 7) + threadIdx_x // 7 - -(T.Mul(25, 7) + threadIdx_x // 7) // 9 - -(T.Mul(25, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(25, 7) - -T.Mul(25, 7) + threadIdx_x // 7 - -(T.Mul(25, 7) + threadIdx_x // 7) % 9 - -(T.Mul(25, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(25, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(25, 7) + threadIdx_x // 
7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(25, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(25, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(25, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(25, 7) + threadIdx_x // 7) % 9 and (T.Mul(25, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(25, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(25, 49) - -T.Mul(25, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(25, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(25, 7) + threadIdx_x // 7) % 9 and (T.Mul(25, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(25, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(25, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(25, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(25, 7) + threadIdx_x // 7) % 9 and (T.Mul(25, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(25, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(25, 7) + threadIdx_x // 7 < 576: - if T.Mul(25, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(25, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(25, 7) + threadIdx_x // 7) % 9 and (T.Mul(25, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(25, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(25, 7) + threadIdx_x // 7 < 576: - if T.Mul(25, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(25, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(25, 7) + threadIdx_x // 7) % 9 and (T.Mul(25, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(25, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -26 - -T.Mul(26, 7) - -T.Mul(26, 7) + threadIdx_x // 7 - -T.Mul(26, 7) + threadIdx_x // 7 < 576 - -T.Mul(26, 49) - -T.Mul(26, 49) + threadIdx_x - -T.Mul(26, 49) + threadIdx_x < 4032 - -T.Mul(26, 7) - -T.Mul(26, 7) + threadIdx_x // 7 - -(T.Mul(26, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(26, 7) + threadIdx_x // 7) % 9 - -T.Mul(26, 7) - -T.Mul(26, 7) + threadIdx_x // 7 - -(T.Mul(26, 7) + threadIdx_x // 7) % 9 - -(T.Mul(26, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(26, 7) + threadIdx_x // 7) % 9 and (T.Mul(26, 7) + threadIdx_x // 7) 
% 9 < 8 - -1 <= (T.Mul(26, 7) + threadIdx_x // 7) % 9 and (T.Mul(26, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(26, 7) + threadIdx_x // 7) % 9 and (T.Mul(26, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(26, 7) - -T.Mul(26, 7) + threadIdx_x // 7 - -(T.Mul(26, 7) + threadIdx_x // 7) // 9 - -(T.Mul(26, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(26, 7) - -T.Mul(26, 7) + threadIdx_x // 7 - -(T.Mul(26, 7) + threadIdx_x // 7) % 9 - -(T.Mul(26, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(26, 7) + threadIdx_x // 7) % 9 and (T.Mul(26, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(26, 49) - -T.Mul(26, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(26, 49) + threadIdx_x] = 
T.if_then_else(1 <= (T.Mul(26, 7) + threadIdx_x // 7) % 9 and (T.Mul(26, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(26, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(26, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(26, 7) + threadIdx_x // 7) % 9 and (T.Mul(26, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(26, 7) + threadIdx_x // 7 < 576: - if T.Mul(26, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(26, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(26, 7) + threadIdx_x // 7) % 9 and (T.Mul(26, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(26, 7) + threadIdx_x // 7 < 576: - if T.Mul(26, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(26, 49) + threadIdx_x] = T.if_then_else(1 <= 
(T.Mul(26, 7) + threadIdx_x // 7) % 9 and (T.Mul(26, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -27 - -T.Mul(27, 7) - -T.Mul(27, 7) + threadIdx_x // 7 - -T.Mul(27, 7) + threadIdx_x // 7 < 576 - -T.Mul(27, 49) - -T.Mul(27, 49) + threadIdx_x - -T.Mul(27, 49) + threadIdx_x < 4032 - -T.Mul(27, 7) - -T.Mul(27, 7) + threadIdx_x // 7 - -(T.Mul(27, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(27, 7) + threadIdx_x // 7) % 9 - -T.Mul(27, 7) - -T.Mul(27, 7) + threadIdx_x // 7 - -(T.Mul(27, 7) + threadIdx_x // 7) % 9 - -(T.Mul(27, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(27, 7) + threadIdx_x // 7) % 9 and (T.Mul(27, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(27, 7) + threadIdx_x // 7) % 9 and (T.Mul(27, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(27, 7) + threadIdx_x // 7) % 9 and (T.Mul(27, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(27, 7) - -T.Mul(27, 7) + threadIdx_x // 7 - -(T.Mul(27, 7) + threadIdx_x // 7) // 9 - -(T.Mul(27, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(27, 7) - -T.Mul(27, 7) + threadIdx_x // 7 - -(T.Mul(27, 7) + threadIdx_x // 7) % 9 - -(T.Mul(27, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(27, 7) + 
threadIdx_x // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(27, 7) + threadIdx_x // 7) % 9 and (T.Mul(27, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(27, 49) - -T.Mul(27, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(27, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(27, 7) + threadIdx_x // 7) % 9 and (T.Mul(27, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(27, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(27, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(27, 7) + threadIdx_x // 7) % 9 and (T.Mul(27, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(27, 7) + threadIdx_x // 7 < 576: - if T.Mul(27, 49) + threadIdx_x < 4032: - 
pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(27, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(27, 7) + threadIdx_x // 7) % 9 and (T.Mul(27, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(27, 7) + threadIdx_x // 7 < 576: - if T.Mul(27, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(27, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(27, 7) + threadIdx_x // 7) % 9 and (T.Mul(27, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -28 - -T.Mul(28, 7) - -T.Mul(28, 7) + threadIdx_x // 7 - -T.Mul(28, 7) + threadIdx_x // 7 < 576 - -T.Mul(28, 49) - -T.Mul(28, 49) + threadIdx_x - -T.Mul(28, 49) + threadIdx_x < 4032 - -T.Mul(28, 7) - -T.Mul(28, 7) + threadIdx_x // 7 - -(T.Mul(28, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(28, 7) + threadIdx_x // 7) % 9 - -T.Mul(28, 7) - -T.Mul(28, 7) + threadIdx_x // 7 - -(T.Mul(28, 7) + threadIdx_x // 7) % 9 - -(T.Mul(28, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(28, 7) + threadIdx_x // 7) % 9 and (T.Mul(28, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(28, 7) + threadIdx_x // 7) % 9 and (T.Mul(28, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(28, 7) + threadIdx_x // 7) % 9 and (T.Mul(28, 7) + 
threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(28, 7) - -T.Mul(28, 7) + threadIdx_x // 7 - -(T.Mul(28, 7) + threadIdx_x // 7) // 9 - -(T.Mul(28, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(28, 7) - -T.Mul(28, 7) + threadIdx_x // 7 - -(T.Mul(28, 7) + threadIdx_x // 7) % 9 - -(T.Mul(28, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(28, 7) + threadIdx_x // 7) % 9 and (T.Mul(28, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(28, 49) - -T.Mul(28, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(28, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(28, 7) + threadIdx_x // 7) % 9 and (T.Mul(28, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer 
* 3136 + (T.Mul(28, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(28, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(28, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(28, 7) + threadIdx_x // 7) % 9 and (T.Mul(28, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(28, 7) + threadIdx_x // 7 < 576: - if T.Mul(28, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(28, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(28, 7) + threadIdx_x // 7) % 9 and (T.Mul(28, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(28, 7) + threadIdx_x // 7 < 576: - if T.Mul(28, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(28, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(28, 7) + threadIdx_x // 7) % 9 and (T.Mul(28, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(28, 7) + 
threadIdx_x // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -29 - -T.Mul(29, 7) - -T.Mul(29, 7) + threadIdx_x // 7 - -T.Mul(29, 7) + threadIdx_x // 7 < 576 - -T.Mul(29, 49) - -T.Mul(29, 49) + threadIdx_x - -T.Mul(29, 49) + threadIdx_x < 4032 - -T.Mul(29, 7) - -T.Mul(29, 7) + threadIdx_x // 7 - -(T.Mul(29, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(29, 7) + threadIdx_x // 7) % 9 - -T.Mul(29, 7) - -T.Mul(29, 7) + threadIdx_x // 7 - -(T.Mul(29, 7) + threadIdx_x // 7) % 9 - -(T.Mul(29, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(29, 7) + threadIdx_x // 7) % 9 and (T.Mul(29, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(29, 7) + threadIdx_x // 7) % 9 and (T.Mul(29, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(29, 7) + threadIdx_x // 7) % 9 and (T.Mul(29, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(29, 7) - -T.Mul(29, 7) + threadIdx_x // 7 - -(T.Mul(29, 7) + threadIdx_x // 7) // 9 - -(T.Mul(29, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(29, 7) - -T.Mul(29, 7) + threadIdx_x // 7 - -(T.Mul(29, 7) + threadIdx_x // 7) % 9 - -(T.Mul(29, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x // 7) 
% 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(29, 7) + threadIdx_x // 7) % 9 and (T.Mul(29, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(29, 49) - -T.Mul(29, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(29, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(29, 7) + threadIdx_x // 7) % 9 and (T.Mul(29, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(29, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(29, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(29, 7) + threadIdx_x // 7) % 9 and (T.Mul(29, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(29, 7) + threadIdx_x // 7 < 576: - if T.Mul(29, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(29, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(29, 7) + threadIdx_x 
// 7) % 9 and (T.Mul(29, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(29, 7) + threadIdx_x // 7 < 576: - if T.Mul(29, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(29, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(29, 7) + threadIdx_x // 7) % 9 and (T.Mul(29, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -30 - -T.Mul(30, 7) - -T.Mul(30, 7) + threadIdx_x // 7 - -T.Mul(30, 7) + threadIdx_x // 7 < 576 - -T.Mul(30, 49) - -T.Mul(30, 49) + threadIdx_x - -T.Mul(30, 49) + threadIdx_x < 4032 - -T.Mul(30, 7) - -T.Mul(30, 7) + threadIdx_x // 7 - -(T.Mul(30, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(30, 7) + threadIdx_x // 7) % 9 - -T.Mul(30, 7) - -T.Mul(30, 7) + threadIdx_x // 7 - -(T.Mul(30, 7) + threadIdx_x // 7) % 9 - -(T.Mul(30, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(30, 7) + threadIdx_x // 7) % 9 and (T.Mul(30, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(30, 7) + threadIdx_x // 7) % 9 and (T.Mul(30, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(30, 7) + threadIdx_x // 7) % 9 and (T.Mul(30, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(30, 7) - -T.Mul(30, 7) + threadIdx_x // 7 - -(T.Mul(30, 7) + threadIdx_x // 7) // 9 - -(T.Mul(30, 7) + threadIdx_x // 7) 
// 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(30, 7) - -T.Mul(30, 7) + threadIdx_x // 7 - -(T.Mul(30, 7) + threadIdx_x // 7) % 9 - -(T.Mul(30, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(30, 7) + threadIdx_x // 7) % 9 and (T.Mul(30, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(30, 49) - -T.Mul(30, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(30, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(30, 7) + threadIdx_x // 7) % 9 and (T.Mul(30, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(30, 49) + threadIdx_x < 4032: - pad_temp_shared 
= T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(30, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(30, 7) + threadIdx_x // 7) % 9 and (T.Mul(30, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(30, 7) + threadIdx_x // 7 < 576: - if T.Mul(30, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(30, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(30, 7) + threadIdx_x // 7) % 9 and (T.Mul(30, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(30, 7) + threadIdx_x // 7 < 576: - if T.Mul(30, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(30, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(30, 7) + threadIdx_x // 7) % 9 and (T.Mul(30, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -31 - -T.Mul(31, 7) - -T.Mul(31, 7) + threadIdx_x // 7 - -T.Mul(31, 7) + threadIdx_x // 7 < 576 - 
-T.Mul(31, 49) - -T.Mul(31, 49) + threadIdx_x - -T.Mul(31, 49) + threadIdx_x < 4032 - -T.Mul(31, 7) - -T.Mul(31, 7) + threadIdx_x // 7 - -(T.Mul(31, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(31, 7) + threadIdx_x // 7) % 9 - -T.Mul(31, 7) - -T.Mul(31, 7) + threadIdx_x // 7 - -(T.Mul(31, 7) + threadIdx_x // 7) % 9 - -(T.Mul(31, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(31, 7) + threadIdx_x // 7) % 9 and (T.Mul(31, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(31, 7) + threadIdx_x // 7) % 9 and (T.Mul(31, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(31, 7) + threadIdx_x // 7) % 9 and (T.Mul(31, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(31, 7) - -T.Mul(31, 7) + threadIdx_x // 7 - -(T.Mul(31, 7) + threadIdx_x // 7) // 9 - -(T.Mul(31, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(31, 7) - -T.Mul(31, 7) + threadIdx_x // 7 - -(T.Mul(31, 7) + threadIdx_x // 7) % 9 - -(T.Mul(31, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(31, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(31, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(31, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(31, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(31, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(31, 7) + threadIdx_x // 7) % 9 and (T.Mul(31, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + 
threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(31, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(31, 49) - -T.Mul(31, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(31, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(31, 7) + threadIdx_x // 7) % 9 and (T.Mul(31, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(31, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(31, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(31, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(31, 7) + threadIdx_x // 7) % 9 and (T.Mul(31, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(31, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(31, 7) + threadIdx_x // 7 < 576: - if T.Mul(31, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(31, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(31, 7) + threadIdx_x // 7) % 9 and (T.Mul(31, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(31, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(31, 7) + threadIdx_x // 7 < 576: - if T.Mul(31, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(31, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(31, 7) + threadIdx_x // 7) % 9 and (T.Mul(31, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(31, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -32 - -T.Mul(32, 7) - -T.Mul(32, 7) + threadIdx_x // 7 - -T.Mul(32, 7) + threadIdx_x // 7 < 576 - -T.Mul(32, 49) - -T.Mul(32, 49) + threadIdx_x - -T.Mul(32, 49) + threadIdx_x < 4032 - -T.Mul(32, 7) - -T.Mul(32, 7) + threadIdx_x // 7 - -(T.Mul(32, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(32, 7) + threadIdx_x // 7) % 9 - -T.Mul(32, 7) - -T.Mul(32, 7) + threadIdx_x // 7 - -(T.Mul(32, 7) + threadIdx_x // 7) % 9 - -(T.Mul(32, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(32, 7) + threadIdx_x // 7) % 9 and (T.Mul(32, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(32, 7) + threadIdx_x // 7) % 9 and (T.Mul(32, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(32, 7) + threadIdx_x // 7) % 9 and (T.Mul(32, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(32, 7) - -T.Mul(32, 7) + threadIdx_x // 7 - -(T.Mul(32, 7) + threadIdx_x // 7) // 9 - -(T.Mul(32, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(32, 7) - -T.Mul(32, 7) + threadIdx_x // 7 - -(T.Mul(32, 7) + threadIdx_x // 7) % 9 - -(T.Mul(32, 7) + threadIdx_x // 7) % 9 * 
7 - -rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(32, 7) + threadIdx_x // 7) % 9 and (T.Mul(32, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(32, 49) - -T.Mul(32, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(32, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(32, 7) + threadIdx_x // 7) % 9 and (T.Mul(32, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(32, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(32, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(32, 7) + 
threadIdx_x // 7) % 9 and (T.Mul(32, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(32, 7) + threadIdx_x // 7 < 576: - if T.Mul(32, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(32, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(32, 7) + threadIdx_x // 7) % 9 and (T.Mul(32, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(32, 7) + threadIdx_x // 7 < 576: - if T.Mul(32, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(32, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(32, 7) + threadIdx_x // 7) % 9 and (T.Mul(32, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -33 - -T.Mul(33, 7) - -T.Mul(33, 7) + threadIdx_x // 7 - -T.Mul(33, 7) + threadIdx_x // 7 < 576 - -T.Mul(33, 49) - -T.Mul(33, 49) + threadIdx_x - -T.Mul(33, 49) + threadIdx_x < 4032 - -T.Mul(33, 7) - -T.Mul(33, 7) + threadIdx_x // 7 - -(T.Mul(33, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(33, 7) + threadIdx_x // 
7) % 9 - -T.Mul(33, 7) - -T.Mul(33, 7) + threadIdx_x // 7 - -(T.Mul(33, 7) + threadIdx_x // 7) % 9 - -(T.Mul(33, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(33, 7) + threadIdx_x // 7) % 9 and (T.Mul(33, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(33, 7) + threadIdx_x // 7) % 9 and (T.Mul(33, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(33, 7) + threadIdx_x // 7) % 9 and (T.Mul(33, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(33, 7) - -T.Mul(33, 7) + threadIdx_x // 7 - -(T.Mul(33, 7) + threadIdx_x // 7) // 9 - -(T.Mul(33, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(33, 7) - -T.Mul(33, 7) + threadIdx_x // 7 - -(T.Mul(33, 7) + threadIdx_x // 7) % 9 - -(T.Mul(33, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(33, 7) + threadIdx_x // 7) % 9 and (T.Mul(33, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(33, 49) - -T.Mul(33, 
49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(33, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(33, 7) + threadIdx_x // 7) % 9 and (T.Mul(33, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(33, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(33, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(33, 7) + threadIdx_x // 7) % 9 and (T.Mul(33, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(33, 7) + threadIdx_x // 7 < 576: - if T.Mul(33, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(33, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(33, 7) + threadIdx_x // 7) % 9 and (T.Mul(33, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(33, 7) + threadIdx_x // 7 < 576: - if T.Mul(33, 49) + 
threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(33, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(33, 7) + threadIdx_x // 7) % 9 and (T.Mul(33, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -34 - -T.Mul(34, 7) - -T.Mul(34, 7) + threadIdx_x // 7 - -T.Mul(34, 7) + threadIdx_x // 7 < 576 - -T.Mul(34, 49) - -T.Mul(34, 49) + threadIdx_x - -T.Mul(34, 49) + threadIdx_x < 4032 - -T.Mul(34, 7) - -T.Mul(34, 7) + threadIdx_x // 7 - -(T.Mul(34, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(34, 7) + threadIdx_x // 7) % 9 - -T.Mul(34, 7) - -T.Mul(34, 7) + threadIdx_x // 7 - -(T.Mul(34, 7) + threadIdx_x // 7) % 9 - -(T.Mul(34, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(34, 7) + threadIdx_x // 7) % 9 and (T.Mul(34, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(34, 7) + threadIdx_x // 7) % 9 and (T.Mul(34, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(34, 7) + threadIdx_x // 7) % 9 and (T.Mul(34, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(34, 7) - -T.Mul(34, 7) + threadIdx_x // 7 - -(T.Mul(34, 7) + threadIdx_x // 7) // 9 - -(T.Mul(34, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(34, 7) - -T.Mul(34, 7) + threadIdx_x // 7 - -(T.Mul(34, 7) + threadIdx_x // 7) % 9 - -(T.Mul(34, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x // 
7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(34, 7) + threadIdx_x // 7) % 9 and (T.Mul(34, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(34, 49) - -T.Mul(34, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(34, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(34, 7) + threadIdx_x // 7) % 9 and (T.Mul(34, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(34, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(34, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(34, 7) + threadIdx_x // 7) % 9 and (T.Mul(34, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(34, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(34, 7) + threadIdx_x // 7 < 576: - if T.Mul(34, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(34, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(34, 7) + threadIdx_x // 7) % 9 and (T.Mul(34, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(34, 7) + threadIdx_x // 7 < 576: - if T.Mul(34, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(34, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(34, 7) + threadIdx_x // 7) % 9 and (T.Mul(34, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -35 - -T.Mul(35, 7) - -T.Mul(35, 7) + threadIdx_x // 7 - -T.Mul(35, 7) + threadIdx_x // 7 < 576 - -T.Mul(35, 49) - -T.Mul(35, 49) + threadIdx_x - -T.Mul(35, 49) + threadIdx_x < 4032 - -T.Mul(35, 7) - -T.Mul(35, 7) + threadIdx_x // 7 - -(T.Mul(35, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(35, 7) + threadIdx_x // 7) % 9 - -T.Mul(35, 7) - -T.Mul(35, 7) + threadIdx_x // 7 - -(T.Mul(35, 7) + threadIdx_x // 7) % 9 - -(T.Mul(35, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(35, 7) + threadIdx_x // 7) % 9 and (T.Mul(35, 7) + threadIdx_x // 7) 
% 9 < 8 - -1 <= (T.Mul(35, 7) + threadIdx_x // 7) % 9 and (T.Mul(35, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(35, 7) + threadIdx_x // 7) % 9 and (T.Mul(35, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(35, 7) - -T.Mul(35, 7) + threadIdx_x // 7 - -(T.Mul(35, 7) + threadIdx_x // 7) // 9 - -(T.Mul(35, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(35, 7) - -T.Mul(35, 7) + threadIdx_x // 7 - -(T.Mul(35, 7) + threadIdx_x // 7) % 9 - -(T.Mul(35, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(35, 7) + threadIdx_x // 7) % 9 and (T.Mul(35, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(35, 49) - -T.Mul(35, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(35, 49) + threadIdx_x] = 
T.if_then_else(1 <= (T.Mul(35, 7) + threadIdx_x // 7) % 9 and (T.Mul(35, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(35, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(35, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(35, 7) + threadIdx_x // 7) % 9 and (T.Mul(35, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(35, 7) + threadIdx_x // 7 < 576: - if T.Mul(35, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(35, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(35, 7) + threadIdx_x // 7) % 9 and (T.Mul(35, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(35, 7) + threadIdx_x // 7 < 576: - if T.Mul(35, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(35, 49) + threadIdx_x] = T.if_then_else(1 <= 
(T.Mul(35, 7) + threadIdx_x // 7) % 9 and (T.Mul(35, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -36 - -T.Mul(36, 7) - -T.Mul(36, 7) + threadIdx_x // 7 - -T.Mul(36, 7) + threadIdx_x // 7 < 576 - -T.Mul(36, 49) - -T.Mul(36, 49) + threadIdx_x - -T.Mul(36, 49) + threadIdx_x < 4032 - -T.Mul(36, 7) - -T.Mul(36, 7) + threadIdx_x // 7 - -(T.Mul(36, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(36, 7) + threadIdx_x // 7) % 9 - -T.Mul(36, 7) - -T.Mul(36, 7) + threadIdx_x // 7 - -(T.Mul(36, 7) + threadIdx_x // 7) % 9 - -(T.Mul(36, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(36, 7) + threadIdx_x // 7) % 9 and (T.Mul(36, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(36, 7) + threadIdx_x // 7) % 9 and (T.Mul(36, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(36, 7) + threadIdx_x // 7) % 9 and (T.Mul(36, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(36, 7) - -T.Mul(36, 7) + threadIdx_x // 7 - -(T.Mul(36, 7) + threadIdx_x // 7) // 9 - -(T.Mul(36, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(36, 7) - -T.Mul(36, 7) + threadIdx_x // 7 - -(T.Mul(36, 7) + threadIdx_x // 7) % 9 - -(T.Mul(36, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(36, 7) + 
threadIdx_x // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(36, 7) + threadIdx_x // 7) % 9 and (T.Mul(36, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(36, 49) - -T.Mul(36, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(36, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(36, 7) + threadIdx_x // 7) % 9 and (T.Mul(36, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(36, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(36, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(36, 7) + threadIdx_x // 7) % 9 and (T.Mul(36, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(36, 7) + threadIdx_x // 7 < 576: - if T.Mul(36, 49) + threadIdx_x < 4032: - 
pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(36, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(36, 7) + threadIdx_x // 7) % 9 and (T.Mul(36, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(36, 7) + threadIdx_x // 7 < 576: - if T.Mul(36, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(36, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(36, 7) + threadIdx_x // 7) % 9 and (T.Mul(36, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -37 - -T.Mul(37, 7) - -T.Mul(37, 7) + threadIdx_x // 7 - -T.Mul(37, 7) + threadIdx_x // 7 < 576 - -T.Mul(37, 49) - -T.Mul(37, 49) + threadIdx_x - -T.Mul(37, 49) + threadIdx_x < 4032 - -T.Mul(37, 7) - -T.Mul(37, 7) + threadIdx_x // 7 - -(T.Mul(37, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(37, 7) + threadIdx_x // 7) % 9 - -T.Mul(37, 7) - -T.Mul(37, 7) + threadIdx_x // 7 - -(T.Mul(37, 7) + threadIdx_x // 7) % 9 - -(T.Mul(37, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(37, 7) + threadIdx_x // 7) % 9 and (T.Mul(37, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(37, 7) + threadIdx_x // 7) % 9 and (T.Mul(37, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(37, 7) + threadIdx_x // 7) % 9 and (T.Mul(37, 7) + 
threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(37, 7) - -T.Mul(37, 7) + threadIdx_x // 7 - -(T.Mul(37, 7) + threadIdx_x // 7) // 9 - -(T.Mul(37, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(37, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(37, 7) - -T.Mul(37, 7) + threadIdx_x // 7 - -(T.Mul(37, 7) + threadIdx_x // 7) % 9 - -(T.Mul(37, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(37, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(37, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(37, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(37, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(37, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(37, 7) + threadIdx_x // 7) % 9 and (T.Mul(37, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(37, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(37, 49) - -T.Mul(37, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(37, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(37, 7) + threadIdx_x // 7) % 9 and (T.Mul(37, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer 
* 3136 + (T.Mul(37, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(37, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(37, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(37, 7) + threadIdx_x // 7) % 9 and (T.Mul(37, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(37, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(37, 7) + threadIdx_x // 7 < 576: - if T.Mul(37, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(37, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(37, 7) + threadIdx_x // 7) % 9 and (T.Mul(37, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(37, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(37, 7) + threadIdx_x // 7 < 576: - if T.Mul(37, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(37, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(37, 7) + threadIdx_x // 7) % 9 and (T.Mul(37, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(37, 7) + 
threadIdx_x // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -38 - -T.Mul(38, 7) - -T.Mul(38, 7) + threadIdx_x // 7 - -T.Mul(38, 7) + threadIdx_x // 7 < 576 - -T.Mul(38, 49) - -T.Mul(38, 49) + threadIdx_x - -T.Mul(38, 49) + threadIdx_x < 4032 - -T.Mul(38, 7) - -T.Mul(38, 7) + threadIdx_x // 7 - -(T.Mul(38, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(38, 7) + threadIdx_x // 7) % 9 - -T.Mul(38, 7) - -T.Mul(38, 7) + threadIdx_x // 7 - -(T.Mul(38, 7) + threadIdx_x // 7) % 9 - -(T.Mul(38, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(38, 7) + threadIdx_x // 7) % 9 and (T.Mul(38, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(38, 7) + threadIdx_x // 7) % 9 and (T.Mul(38, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(38, 7) + threadIdx_x // 7) % 9 and (T.Mul(38, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(38, 7) - -T.Mul(38, 7) + threadIdx_x // 7 - -(T.Mul(38, 7) + threadIdx_x // 7) // 9 - -(T.Mul(38, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(38, 7) - -T.Mul(38, 7) + threadIdx_x // 7 - -(T.Mul(38, 7) + threadIdx_x // 7) % 9 - -(T.Mul(38, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x // 7) 
% 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(38, 7) + threadIdx_x // 7) % 9 and (T.Mul(38, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(38, 49) - -T.Mul(38, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(38, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(38, 7) + threadIdx_x // 7) % 9 and (T.Mul(38, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(38, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(38, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(38, 7) + threadIdx_x // 7) % 9 and (T.Mul(38, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(38, 7) + threadIdx_x // 7 < 576: - if T.Mul(38, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(38, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(38, 7) + threadIdx_x 
// 7) % 9 and (T.Mul(38, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(38, 7) + threadIdx_x // 7 < 576: - if T.Mul(38, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(38, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(38, 7) + threadIdx_x // 7) % 9 and (T.Mul(38, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -39 - -T.Mul(39, 7) - -T.Mul(39, 7) + threadIdx_x // 7 - -T.Mul(39, 7) + threadIdx_x // 7 < 576 - -T.Mul(39, 49) - -T.Mul(39, 49) + threadIdx_x - -T.Mul(39, 49) + threadIdx_x < 4032 - -T.Mul(39, 7) - -T.Mul(39, 7) + threadIdx_x // 7 - -(T.Mul(39, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(39, 7) + threadIdx_x // 7) % 9 - -T.Mul(39, 7) - -T.Mul(39, 7) + threadIdx_x // 7 - -(T.Mul(39, 7) + threadIdx_x // 7) % 9 - -(T.Mul(39, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(39, 7) + threadIdx_x // 7) % 9 and (T.Mul(39, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(39, 7) + threadIdx_x // 7) % 9 and (T.Mul(39, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(39, 7) + threadIdx_x // 7) % 9 and (T.Mul(39, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(39, 7) - -T.Mul(39, 7) + threadIdx_x // 7 - -(T.Mul(39, 7) + threadIdx_x // 7) // 9 - -(T.Mul(39, 7) + threadIdx_x // 7) 
// 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(39, 7) - -T.Mul(39, 7) + threadIdx_x // 7 - -(T.Mul(39, 7) + threadIdx_x // 7) % 9 - -(T.Mul(39, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(39, 7) + threadIdx_x // 7) % 9 and (T.Mul(39, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(39, 49) - -T.Mul(39, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(39, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(39, 7) + threadIdx_x // 7) % 9 and (T.Mul(39, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(39, 49) + threadIdx_x < 4032: - pad_temp_shared 
= T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(39, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(39, 7) + threadIdx_x // 7) % 9 and (T.Mul(39, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(39, 7) + threadIdx_x // 7 < 576: - if T.Mul(39, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(39, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(39, 7) + threadIdx_x // 7) % 9 and (T.Mul(39, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(39, 7) + threadIdx_x // 7 < 576: - if T.Mul(39, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(39, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(39, 7) + threadIdx_x // 7) % 9 and (T.Mul(39, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -40 - -T.Mul(40, 7) - -T.Mul(40, 7) + threadIdx_x // 7 - -T.Mul(40, 7) + threadIdx_x // 7 < 576 - 
-T.Mul(40, 49) - -T.Mul(40, 49) + threadIdx_x - -T.Mul(40, 49) + threadIdx_x < 4032 - -T.Mul(40, 7) - -T.Mul(40, 7) + threadIdx_x // 7 - -(T.Mul(40, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(40, 7) + threadIdx_x // 7) % 9 - -T.Mul(40, 7) - -T.Mul(40, 7) + threadIdx_x // 7 - -(T.Mul(40, 7) + threadIdx_x // 7) % 9 - -(T.Mul(40, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(40, 7) + threadIdx_x // 7) % 9 and (T.Mul(40, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(40, 7) + threadIdx_x // 7) % 9 and (T.Mul(40, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(40, 7) + threadIdx_x // 7) % 9 and (T.Mul(40, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(40, 7) - -T.Mul(40, 7) + threadIdx_x // 7 - -(T.Mul(40, 7) + threadIdx_x // 7) // 9 - -(T.Mul(40, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(40, 7) - -T.Mul(40, 7) + threadIdx_x // 7 - -(T.Mul(40, 7) + threadIdx_x // 7) % 9 - -(T.Mul(40, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(40, 7) + threadIdx_x // 7) % 9 and (T.Mul(40, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + 
threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(40, 49) - -T.Mul(40, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(40, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(40, 7) + threadIdx_x // 7) % 9 and (T.Mul(40, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(40, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(40, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(40, 7) + threadIdx_x // 7) % 9 and (T.Mul(40, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(40, 7) + threadIdx_x // 7 < 576: - if T.Mul(40, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(40, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(40, 7) + threadIdx_x // 7) % 9 and (T.Mul(40, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(40, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(40, 7) + threadIdx_x // 7 < 576: - if T.Mul(40, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(40, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(40, 7) + threadIdx_x // 7) % 9 and (T.Mul(40, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -41 - -T.Mul(41, 7) - -T.Mul(41, 7) + threadIdx_x // 7 - -T.Mul(41, 7) + threadIdx_x // 7 < 576 - -T.Mul(41, 49) - -T.Mul(41, 49) + threadIdx_x - -T.Mul(41, 49) + threadIdx_x < 4032 - -T.Mul(41, 7) - -T.Mul(41, 7) + threadIdx_x // 7 - -(T.Mul(41, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(41, 7) + threadIdx_x // 7) % 9 - -T.Mul(41, 7) - -T.Mul(41, 7) + threadIdx_x // 7 - -(T.Mul(41, 7) + threadIdx_x // 7) % 9 - -(T.Mul(41, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(41, 7) + threadIdx_x // 7) % 9 and (T.Mul(41, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(41, 7) + threadIdx_x // 7) % 9 and (T.Mul(41, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(41, 7) + threadIdx_x // 7) % 9 and (T.Mul(41, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(41, 7) - -T.Mul(41, 7) + threadIdx_x // 7 - -(T.Mul(41, 7) + threadIdx_x // 7) // 9 - -(T.Mul(41, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(41, 7) - -T.Mul(41, 7) + threadIdx_x // 7 - -(T.Mul(41, 7) + threadIdx_x // 7) % 9 - -(T.Mul(41, 7) + threadIdx_x // 7) % 9 * 
7 - -rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(41, 7) + threadIdx_x // 7) % 9 and (T.Mul(41, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(41, 49) - -T.Mul(41, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(41, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(41, 7) + threadIdx_x // 7) % 9 and (T.Mul(41, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(41, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(41, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(41, 7) + 
threadIdx_x // 7) % 9 and (T.Mul(41, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(41, 7) + threadIdx_x // 7 < 576: - if T.Mul(41, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(41, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(41, 7) + threadIdx_x // 7) % 9 and (T.Mul(41, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(41, 7) + threadIdx_x // 7 < 576: - if T.Mul(41, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(41, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(41, 7) + threadIdx_x // 7) % 9 and (T.Mul(41, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -42 - -T.Mul(42, 7) - -T.Mul(42, 7) + threadIdx_x // 7 - -T.Mul(42, 7) + threadIdx_x // 7 < 576 - -T.Mul(42, 49) - -T.Mul(42, 49) + threadIdx_x - -T.Mul(42, 49) + threadIdx_x < 4032 - -T.Mul(42, 7) - -T.Mul(42, 7) + threadIdx_x // 7 - -(T.Mul(42, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(42, 7) + threadIdx_x // 
7) % 9 - -T.Mul(42, 7) - -T.Mul(42, 7) + threadIdx_x // 7 - -(T.Mul(42, 7) + threadIdx_x // 7) % 9 - -(T.Mul(42, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(42, 7) + threadIdx_x // 7) % 9 and (T.Mul(42, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(42, 7) + threadIdx_x // 7) % 9 and (T.Mul(42, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(42, 7) + threadIdx_x // 7) % 9 and (T.Mul(42, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(42, 7) - -T.Mul(42, 7) + threadIdx_x // 7 - -(T.Mul(42, 7) + threadIdx_x // 7) // 9 - -(T.Mul(42, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(42, 7) - -T.Mul(42, 7) + threadIdx_x // 7 - -(T.Mul(42, 7) + threadIdx_x // 7) % 9 - -(T.Mul(42, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(42, 7) + threadIdx_x // 7) % 9 and (T.Mul(42, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(42, 49) - -T.Mul(42, 
49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(42, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(42, 7) + threadIdx_x // 7) % 9 and (T.Mul(42, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(42, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(42, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(42, 7) + threadIdx_x // 7) % 9 and (T.Mul(42, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(42, 7) + threadIdx_x // 7 < 576: - if T.Mul(42, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(42, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(42, 7) + threadIdx_x // 7) % 9 and (T.Mul(42, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(42, 7) + threadIdx_x // 7 < 576: - if T.Mul(42, 49) + 
threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(42, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(42, 7) + threadIdx_x // 7) % 9 and (T.Mul(42, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -43 - -T.Mul(43, 7) - -T.Mul(43, 7) + threadIdx_x // 7 - -T.Mul(43, 7) + threadIdx_x // 7 < 576 - -T.Mul(43, 49) - -T.Mul(43, 49) + threadIdx_x - -T.Mul(43, 49) + threadIdx_x < 4032 - -T.Mul(43, 7) - -T.Mul(43, 7) + threadIdx_x // 7 - -(T.Mul(43, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(43, 7) + threadIdx_x // 7) % 9 - -T.Mul(43, 7) - -T.Mul(43, 7) + threadIdx_x // 7 - -(T.Mul(43, 7) + threadIdx_x // 7) % 9 - -(T.Mul(43, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(43, 7) + threadIdx_x // 7) % 9 and (T.Mul(43, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(43, 7) + threadIdx_x // 7) % 9 and (T.Mul(43, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(43, 7) + threadIdx_x // 7) % 9 and (T.Mul(43, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(43, 7) - -T.Mul(43, 7) + threadIdx_x // 7 - -(T.Mul(43, 7) + threadIdx_x // 7) // 9 - -(T.Mul(43, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(43, 7) - -T.Mul(43, 7) + threadIdx_x // 7 - -(T.Mul(43, 7) + threadIdx_x // 7) % 9 - -(T.Mul(43, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x // 
7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(43, 7) + threadIdx_x // 7) % 9 and (T.Mul(43, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(43, 49) - -T.Mul(43, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(43, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(43, 7) + threadIdx_x // 7) % 9 and (T.Mul(43, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(43, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(43, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(43, 7) + threadIdx_x // 7) % 9 and (T.Mul(43, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(43, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(43, 7) + threadIdx_x // 7 < 576: - if T.Mul(43, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(43, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(43, 7) + threadIdx_x // 7) % 9 and (T.Mul(43, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(43, 7) + threadIdx_x // 7 < 576: - if T.Mul(43, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(43, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(43, 7) + threadIdx_x // 7) % 9 and (T.Mul(43, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -44 - -T.Mul(44, 7) - -T.Mul(44, 7) + threadIdx_x // 7 - -T.Mul(44, 7) + threadIdx_x // 7 < 576 - -T.Mul(44, 49) - -T.Mul(44, 49) + threadIdx_x - -T.Mul(44, 49) + threadIdx_x < 4032 - -T.Mul(44, 7) - -T.Mul(44, 7) + threadIdx_x // 7 - -(T.Mul(44, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(44, 7) + threadIdx_x // 7) % 9 - -T.Mul(44, 7) - -T.Mul(44, 7) + threadIdx_x // 7 - -(T.Mul(44, 7) + threadIdx_x // 7) % 9 - -(T.Mul(44, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(44, 7) + threadIdx_x // 7) % 9 and (T.Mul(44, 7) + threadIdx_x // 7) 
% 9 < 8 - -1 <= (T.Mul(44, 7) + threadIdx_x // 7) % 9 and (T.Mul(44, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(44, 7) + threadIdx_x // 7) % 9 and (T.Mul(44, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(44, 7) - -T.Mul(44, 7) + threadIdx_x // 7 - -(T.Mul(44, 7) + threadIdx_x // 7) // 9 - -(T.Mul(44, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(44, 7) - -T.Mul(44, 7) + threadIdx_x // 7 - -(T.Mul(44, 7) + threadIdx_x // 7) % 9 - -(T.Mul(44, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(44, 7) + threadIdx_x // 7) % 9 and (T.Mul(44, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(44, 49) - -T.Mul(44, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(44, 49) + threadIdx_x] = 
T.if_then_else(1 <= (T.Mul(44, 7) + threadIdx_x // 7) % 9 and (T.Mul(44, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(44, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(44, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(44, 7) + threadIdx_x // 7) % 9 and (T.Mul(44, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(44, 7) + threadIdx_x // 7 < 576: - if T.Mul(44, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(44, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(44, 7) + threadIdx_x // 7) % 9 and (T.Mul(44, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(44, 7) + threadIdx_x // 7 < 576: - if T.Mul(44, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(44, 49) + threadIdx_x] = T.if_then_else(1 <= 
(T.Mul(44, 7) + threadIdx_x // 7) % 9 and (T.Mul(44, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -45 - -T.Mul(45, 7) - -T.Mul(45, 7) + threadIdx_x // 7 - -T.Mul(45, 7) + threadIdx_x // 7 < 576 - -T.Mul(45, 49) - -T.Mul(45, 49) + threadIdx_x - -T.Mul(45, 49) + threadIdx_x < 4032 - -T.Mul(45, 7) - -T.Mul(45, 7) + threadIdx_x // 7 - -(T.Mul(45, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(45, 7) + threadIdx_x // 7) % 9 - -T.Mul(45, 7) - -T.Mul(45, 7) + threadIdx_x // 7 - -(T.Mul(45, 7) + threadIdx_x // 7) % 9 - -(T.Mul(45, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(45, 7) + threadIdx_x // 7) % 9 and (T.Mul(45, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(45, 7) + threadIdx_x // 7) % 9 and (T.Mul(45, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(45, 7) + threadIdx_x // 7) % 9 and (T.Mul(45, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(45, 7) - -T.Mul(45, 7) + threadIdx_x // 7 - -(T.Mul(45, 7) + threadIdx_x // 7) // 9 - -(T.Mul(45, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(45, 7) - -T.Mul(45, 7) + threadIdx_x // 7 - -(T.Mul(45, 7) + threadIdx_x // 7) % 9 - -(T.Mul(45, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(45, 7) + 
threadIdx_x // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(45, 7) + threadIdx_x // 7) % 9 and (T.Mul(45, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(45, 49) - -T.Mul(45, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(45, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(45, 7) + threadIdx_x // 7) % 9 and (T.Mul(45, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(45, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(45, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(45, 7) + threadIdx_x // 7) % 9 and (T.Mul(45, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(45, 7) + threadIdx_x // 7 < 576: - if T.Mul(45, 49) + threadIdx_x < 4032: - 
pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(45, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(45, 7) + threadIdx_x // 7) % 9 and (T.Mul(45, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(45, 7) + threadIdx_x // 7 < 576: - if T.Mul(45, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(45, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(45, 7) + threadIdx_x // 7) % 9 and (T.Mul(45, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -46 - -T.Mul(46, 7) - -T.Mul(46, 7) + threadIdx_x // 7 - -T.Mul(46, 7) + threadIdx_x // 7 < 576 - -T.Mul(46, 49) - -T.Mul(46, 49) + threadIdx_x - -T.Mul(46, 49) + threadIdx_x < 4032 - -T.Mul(46, 7) - -T.Mul(46, 7) + threadIdx_x // 7 - -(T.Mul(46, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(46, 7) + threadIdx_x // 7) % 9 - -T.Mul(46, 7) - -T.Mul(46, 7) + threadIdx_x // 7 - -(T.Mul(46, 7) + threadIdx_x // 7) % 9 - -(T.Mul(46, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(46, 7) + threadIdx_x // 7) % 9 and (T.Mul(46, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(46, 7) + threadIdx_x // 7) % 9 and (T.Mul(46, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(46, 7) + threadIdx_x // 7) % 9 and (T.Mul(46, 7) + 
threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(46, 7) - -T.Mul(46, 7) + threadIdx_x // 7 - -(T.Mul(46, 7) + threadIdx_x // 7) // 9 - -(T.Mul(46, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(46, 7) - -T.Mul(46, 7) + threadIdx_x // 7 - -(T.Mul(46, 7) + threadIdx_x // 7) % 9 - -(T.Mul(46, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(46, 7) + threadIdx_x // 7) % 9 and (T.Mul(46, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(46, 49) - -T.Mul(46, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(46, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(46, 7) + threadIdx_x // 7) % 9 and (T.Mul(46, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer 
* 3136 + (T.Mul(46, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(46, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(46, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(46, 7) + threadIdx_x // 7) % 9 and (T.Mul(46, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(46, 7) + threadIdx_x // 7 < 576: - if T.Mul(46, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(46, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(46, 7) + threadIdx_x // 7) % 9 and (T.Mul(46, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(46, 7) + threadIdx_x // 7 < 576: - if T.Mul(46, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(46, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(46, 7) + threadIdx_x // 7) % 9 and (T.Mul(46, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(46, 7) + 
threadIdx_x // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -47 - -T.Mul(47, 7) - -T.Mul(47, 7) + threadIdx_x // 7 - -T.Mul(47, 7) + threadIdx_x // 7 < 576 - -T.Mul(47, 49) - -T.Mul(47, 49) + threadIdx_x - -T.Mul(47, 49) + threadIdx_x < 4032 - -T.Mul(47, 7) - -T.Mul(47, 7) + threadIdx_x // 7 - -(T.Mul(47, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(47, 7) + threadIdx_x // 7) % 9 - -T.Mul(47, 7) - -T.Mul(47, 7) + threadIdx_x // 7 - -(T.Mul(47, 7) + threadIdx_x // 7) % 9 - -(T.Mul(47, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(47, 7) + threadIdx_x // 7) % 9 and (T.Mul(47, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(47, 7) + threadIdx_x // 7) % 9 and (T.Mul(47, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(47, 7) + threadIdx_x // 7) % 9 and (T.Mul(47, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(47, 7) - -T.Mul(47, 7) + threadIdx_x // 7 - -(T.Mul(47, 7) + threadIdx_x // 7) // 9 - -(T.Mul(47, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(47, 7) - -T.Mul(47, 7) + threadIdx_x // 7 - -(T.Mul(47, 7) + threadIdx_x // 7) % 9 - -(T.Mul(47, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x // 7) 
% 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(47, 7) + threadIdx_x // 7) % 9 and (T.Mul(47, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(47, 49) - -T.Mul(47, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(47, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(47, 7) + threadIdx_x // 7) % 9 and (T.Mul(47, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(47, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(47, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(47, 7) + threadIdx_x // 7) % 9 and (T.Mul(47, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(47, 7) + threadIdx_x // 7 < 576: - if T.Mul(47, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(47, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(47, 7) + threadIdx_x 
// 7) % 9 and (T.Mul(47, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(47, 7) + threadIdx_x // 7 < 576: - if T.Mul(47, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(47, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(47, 7) + threadIdx_x // 7) % 9 and (T.Mul(47, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -48 - -T.Mul(48, 7) - -T.Mul(48, 7) + threadIdx_x // 7 - -T.Mul(48, 7) + threadIdx_x // 7 < 576 - -T.Mul(48, 49) - -T.Mul(48, 49) + threadIdx_x - -T.Mul(48, 49) + threadIdx_x < 4032 - -T.Mul(48, 7) - -T.Mul(48, 7) + threadIdx_x // 7 - -(T.Mul(48, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(48, 7) + threadIdx_x // 7) % 9 - -T.Mul(48, 7) - -T.Mul(48, 7) + threadIdx_x // 7 - -(T.Mul(48, 7) + threadIdx_x // 7) % 9 - -(T.Mul(48, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(48, 7) + threadIdx_x // 7) % 9 and (T.Mul(48, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(48, 7) + threadIdx_x // 7) % 9 and (T.Mul(48, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(48, 7) + threadIdx_x // 7) % 9 and (T.Mul(48, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(48, 7) - -T.Mul(48, 7) + threadIdx_x // 7 - -(T.Mul(48, 7) + threadIdx_x // 7) // 9 - -(T.Mul(48, 7) + threadIdx_x // 7) 
// 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(48, 7) - -T.Mul(48, 7) + threadIdx_x // 7 - -(T.Mul(48, 7) + threadIdx_x // 7) % 9 - -(T.Mul(48, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(48, 7) + threadIdx_x // 7) % 9 and (T.Mul(48, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(48, 49) - -T.Mul(48, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(48, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(48, 7) + threadIdx_x // 7) % 9 and (T.Mul(48, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(48, 49) + threadIdx_x < 4032: - pad_temp_shared 
= T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(48, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(48, 7) + threadIdx_x // 7) % 9 and (T.Mul(48, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(48, 7) + threadIdx_x // 7 < 576: - if T.Mul(48, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(48, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(48, 7) + threadIdx_x // 7) % 9 and (T.Mul(48, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(48, 7) + threadIdx_x // 7 < 576: - if T.Mul(48, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(48, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(48, 7) + threadIdx_x // 7) % 9 and (T.Mul(48, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -49 - -T.Mul(49, 7) - -T.Mul(49, 7) + threadIdx_x // 7 - -T.Mul(49, 7) + threadIdx_x // 7 < 576 - 
-T.Mul(49, 49) - -T.Mul(49, 49) + threadIdx_x - -T.Mul(49, 49) + threadIdx_x < 4032 - -T.Mul(49, 7) - -T.Mul(49, 7) + threadIdx_x // 7 - -(T.Mul(49, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(49, 7) + threadIdx_x // 7) % 9 - -T.Mul(49, 7) - -T.Mul(49, 7) + threadIdx_x // 7 - -(T.Mul(49, 7) + threadIdx_x // 7) % 9 - -(T.Mul(49, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(49, 7) + threadIdx_x // 7) % 9 and (T.Mul(49, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(49, 7) + threadIdx_x // 7) % 9 and (T.Mul(49, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(49, 7) + threadIdx_x // 7) % 9 and (T.Mul(49, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(49, 7) - -T.Mul(49, 7) + threadIdx_x // 7 - -(T.Mul(49, 7) + threadIdx_x // 7) // 9 - -(T.Mul(49, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(49, 7) - -T.Mul(49, 7) + threadIdx_x // 7 - -(T.Mul(49, 7) + threadIdx_x // 7) % 9 - -(T.Mul(49, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(49, 7) + threadIdx_x // 7) % 9 and (T.Mul(49, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + 
threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(49, 49) - -T.Mul(49, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(49, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(49, 7) + threadIdx_x // 7) % 9 and (T.Mul(49, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(49, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(49, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(49, 7) + threadIdx_x // 7) % 9 and (T.Mul(49, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(49, 7) + threadIdx_x // 7 < 576: - if T.Mul(49, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(49, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(49, 7) + threadIdx_x // 7) % 9 and (T.Mul(49, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(49, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(49, 7) + threadIdx_x // 7 < 576: - if T.Mul(49, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(49, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(49, 7) + threadIdx_x // 7) % 9 and (T.Mul(49, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -50 - -T.Mul(50, 7) - -T.Mul(50, 7) + threadIdx_x // 7 - -T.Mul(50, 7) + threadIdx_x // 7 < 576 - -T.Mul(50, 49) - -T.Mul(50, 49) + threadIdx_x - -T.Mul(50, 49) + threadIdx_x < 4032 - -T.Mul(50, 7) - -T.Mul(50, 7) + threadIdx_x // 7 - -(T.Mul(50, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(50, 7) + threadIdx_x // 7) % 9 - -T.Mul(50, 7) - -T.Mul(50, 7) + threadIdx_x // 7 - -(T.Mul(50, 7) + threadIdx_x // 7) % 9 - -(T.Mul(50, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(50, 7) + threadIdx_x // 7) % 9 and (T.Mul(50, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(50, 7) + threadIdx_x // 7) % 9 and (T.Mul(50, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(50, 7) + threadIdx_x // 7) % 9 and (T.Mul(50, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(50, 7) - -T.Mul(50, 7) + threadIdx_x // 7 - -(T.Mul(50, 7) + threadIdx_x // 7) // 9 - -(T.Mul(50, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(50, 7) - -T.Mul(50, 7) + threadIdx_x // 7 - -(T.Mul(50, 7) + threadIdx_x // 7) % 9 - -(T.Mul(50, 7) + threadIdx_x // 7) % 9 * 
7 - -rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(50, 7) + threadIdx_x // 7) % 9 and (T.Mul(50, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(50, 49) - -T.Mul(50, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(50, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(50, 7) + threadIdx_x // 7) % 9 and (T.Mul(50, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(50, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(50, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(50, 7) + 
threadIdx_x // 7) % 9 and (T.Mul(50, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(50, 7) + threadIdx_x // 7 < 576: - if T.Mul(50, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(50, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(50, 7) + threadIdx_x // 7) % 9 and (T.Mul(50, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(50, 7) + threadIdx_x // 7 < 576: - if T.Mul(50, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(50, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(50, 7) + threadIdx_x // 7) % 9 and (T.Mul(50, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -51 - -T.Mul(51, 7) - -T.Mul(51, 7) + threadIdx_x // 7 - -T.Mul(51, 7) + threadIdx_x // 7 < 576 - -T.Mul(51, 49) - -T.Mul(51, 49) + threadIdx_x - -T.Mul(51, 49) + threadIdx_x < 4032 - -T.Mul(51, 7) - -T.Mul(51, 7) + threadIdx_x // 7 - -(T.Mul(51, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(51, 7) + threadIdx_x // 
7) % 9 - -T.Mul(51, 7) - -T.Mul(51, 7) + threadIdx_x // 7 - -(T.Mul(51, 7) + threadIdx_x // 7) % 9 - -(T.Mul(51, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(51, 7) + threadIdx_x // 7) % 9 and (T.Mul(51, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(51, 7) + threadIdx_x // 7) % 9 and (T.Mul(51, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(51, 7) + threadIdx_x // 7) % 9 and (T.Mul(51, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(51, 7) - -T.Mul(51, 7) + threadIdx_x // 7 - -(T.Mul(51, 7) + threadIdx_x // 7) // 9 - -(T.Mul(51, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(51, 7) - -T.Mul(51, 7) + threadIdx_x // 7 - -(T.Mul(51, 7) + threadIdx_x // 7) % 9 - -(T.Mul(51, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(51, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(51, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(51, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(51, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(51, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(51, 7) + threadIdx_x // 7) % 9 and (T.Mul(51, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(51, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(51, 49) - -T.Mul(51, 
49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(51, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(51, 7) + threadIdx_x // 7) % 9 and (T.Mul(51, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(51, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(51, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(51, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(51, 7) + threadIdx_x // 7) % 9 and (T.Mul(51, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(51, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(51, 7) + threadIdx_x // 7 < 576: - if T.Mul(51, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(51, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(51, 7) + threadIdx_x // 7) % 9 and (T.Mul(51, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(51, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(51, 7) + threadIdx_x // 7 < 576: - if T.Mul(51, 49) + 
threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(51, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(51, 7) + threadIdx_x // 7) % 9 and (T.Mul(51, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(51, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -52 - -T.Mul(52, 7) - -T.Mul(52, 7) + threadIdx_x // 7 - -T.Mul(52, 7) + threadIdx_x // 7 < 576 - -T.Mul(52, 49) - -T.Mul(52, 49) + threadIdx_x - -T.Mul(52, 49) + threadIdx_x < 4032 - -T.Mul(52, 7) - -T.Mul(52, 7) + threadIdx_x // 7 - -(T.Mul(52, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(52, 7) + threadIdx_x // 7) % 9 - -T.Mul(52, 7) - -T.Mul(52, 7) + threadIdx_x // 7 - -(T.Mul(52, 7) + threadIdx_x // 7) % 9 - -(T.Mul(52, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(52, 7) + threadIdx_x // 7) % 9 and (T.Mul(52, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(52, 7) + threadIdx_x // 7) % 9 and (T.Mul(52, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(52, 7) + threadIdx_x // 7) % 9 and (T.Mul(52, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(52, 7) - -T.Mul(52, 7) + threadIdx_x // 7 - -(T.Mul(52, 7) + threadIdx_x // 7) // 9 - -(T.Mul(52, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(52, 7) - -T.Mul(52, 7) + threadIdx_x // 7 - -(T.Mul(52, 7) + threadIdx_x // 7) % 9 - -(T.Mul(52, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x // 
7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(52, 7) + threadIdx_x // 7) % 9 and (T.Mul(52, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(52, 49) - -T.Mul(52, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(52, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(52, 7) + threadIdx_x // 7) % 9 and (T.Mul(52, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(52, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(52, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(52, 7) + threadIdx_x // 7) % 9 and (T.Mul(52, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(52, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(52, 7) + threadIdx_x // 7 < 576: - if T.Mul(52, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(52, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(52, 7) + threadIdx_x // 7) % 9 and (T.Mul(52, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(52, 7) + threadIdx_x // 7 < 576: - if T.Mul(52, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(52, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(52, 7) + threadIdx_x // 7) % 9 and (T.Mul(52, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -53 - -T.Mul(53, 7) - -T.Mul(53, 7) + threadIdx_x // 7 - -T.Mul(53, 7) + threadIdx_x // 7 < 576 - -T.Mul(53, 49) - -T.Mul(53, 49) + threadIdx_x - -T.Mul(53, 49) + threadIdx_x < 4032 - -T.Mul(53, 7) - -T.Mul(53, 7) + threadIdx_x // 7 - -(T.Mul(53, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(53, 7) + threadIdx_x // 7) % 9 - -T.Mul(53, 7) - -T.Mul(53, 7) + threadIdx_x // 7 - -(T.Mul(53, 7) + threadIdx_x // 7) % 9 - -(T.Mul(53, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(53, 7) + threadIdx_x // 7) % 9 and (T.Mul(53, 7) + threadIdx_x // 7) 
% 9 < 8 - -1 <= (T.Mul(53, 7) + threadIdx_x // 7) % 9 and (T.Mul(53, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(53, 7) + threadIdx_x // 7) % 9 and (T.Mul(53, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(53, 7) - -T.Mul(53, 7) + threadIdx_x // 7 - -(T.Mul(53, 7) + threadIdx_x // 7) // 9 - -(T.Mul(53, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(53, 7) - -T.Mul(53, 7) + threadIdx_x // 7 - -(T.Mul(53, 7) + threadIdx_x // 7) % 9 - -(T.Mul(53, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(53, 7) + threadIdx_x // 7) % 9 and (T.Mul(53, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(53, 49) - -T.Mul(53, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(53, 49) + threadIdx_x] = 
T.if_then_else(1 <= (T.Mul(53, 7) + threadIdx_x // 7) % 9 and (T.Mul(53, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(53, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(53, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(53, 7) + threadIdx_x // 7) % 9 and (T.Mul(53, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(53, 7) + threadIdx_x // 7 < 576: - if T.Mul(53, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(53, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(53, 7) + threadIdx_x // 7) % 9 and (T.Mul(53, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(53, 7) + threadIdx_x // 7 < 576: - if T.Mul(53, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(53, 49) + threadIdx_x] = T.if_then_else(1 <= 
(T.Mul(53, 7) + threadIdx_x // 7) % 9 and (T.Mul(53, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -54 - -T.Mul(54, 7) - -T.Mul(54, 7) + threadIdx_x // 7 - -T.Mul(54, 7) + threadIdx_x // 7 < 576 - -T.Mul(54, 49) - -T.Mul(54, 49) + threadIdx_x - -T.Mul(54, 49) + threadIdx_x < 4032 - -T.Mul(54, 7) - -T.Mul(54, 7) + threadIdx_x // 7 - -(T.Mul(54, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(54, 7) + threadIdx_x // 7) % 9 - -T.Mul(54, 7) - -T.Mul(54, 7) + threadIdx_x // 7 - -(T.Mul(54, 7) + threadIdx_x // 7) % 9 - -(T.Mul(54, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(54, 7) + threadIdx_x // 7) % 9 and (T.Mul(54, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(54, 7) + threadIdx_x // 7) % 9 and (T.Mul(54, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(54, 7) + threadIdx_x // 7) % 9 and (T.Mul(54, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(54, 7) - -T.Mul(54, 7) + threadIdx_x // 7 - -(T.Mul(54, 7) + threadIdx_x // 7) // 9 - -(T.Mul(54, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(54, 7) - -T.Mul(54, 7) + threadIdx_x // 7 - -(T.Mul(54, 7) + threadIdx_x // 7) % 9 - -(T.Mul(54, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(54, 7) + 
threadIdx_x // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(54, 7) + threadIdx_x // 7) % 9 and (T.Mul(54, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(54, 49) - -T.Mul(54, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(54, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(54, 7) + threadIdx_x // 7) % 9 and (T.Mul(54, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(54, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(54, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(54, 7) + threadIdx_x // 7) % 9 and (T.Mul(54, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(54, 7) + threadIdx_x // 7 < 576: - if T.Mul(54, 49) + threadIdx_x < 4032: - 
pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(54, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(54, 7) + threadIdx_x // 7) % 9 and (T.Mul(54, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(54, 7) + threadIdx_x // 7 < 576: - if T.Mul(54, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(54, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(54, 7) + threadIdx_x // 7) % 9 and (T.Mul(54, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -55 - -T.Mul(55, 7) - -T.Mul(55, 7) + threadIdx_x // 7 - -T.Mul(55, 7) + threadIdx_x // 7 < 576 - -T.Mul(55, 49) - -T.Mul(55, 49) + threadIdx_x - -T.Mul(55, 49) + threadIdx_x < 4032 - -T.Mul(55, 7) - -T.Mul(55, 7) + threadIdx_x // 7 - -(T.Mul(55, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(55, 7) + threadIdx_x // 7) % 9 - -T.Mul(55, 7) - -T.Mul(55, 7) + threadIdx_x // 7 - -(T.Mul(55, 7) + threadIdx_x // 7) % 9 - -(T.Mul(55, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(55, 7) + threadIdx_x // 7) % 9 and (T.Mul(55, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(55, 7) + threadIdx_x // 7) % 9 and (T.Mul(55, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(55, 7) + threadIdx_x // 7) % 9 and (T.Mul(55, 7) + 
threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(55, 7) - -T.Mul(55, 7) + threadIdx_x // 7 - -(T.Mul(55, 7) + threadIdx_x // 7) // 9 - -(T.Mul(55, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(55, 7) - -T.Mul(55, 7) + threadIdx_x // 7 - -(T.Mul(55, 7) + threadIdx_x // 7) % 9 - -(T.Mul(55, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(55, 7) + threadIdx_x // 7) % 9 and (T.Mul(55, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(55, 49) - -T.Mul(55, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(55, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(55, 7) + threadIdx_x // 7) % 9 and (T.Mul(55, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer 
* 3136 + (T.Mul(55, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(55, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(55, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(55, 7) + threadIdx_x // 7) % 9 and (T.Mul(55, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(55, 7) + threadIdx_x // 7 < 576: - if T.Mul(55, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(55, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(55, 7) + threadIdx_x // 7) % 9 and (T.Mul(55, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(55, 7) + threadIdx_x // 7 < 576: - if T.Mul(55, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(55, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(55, 7) + threadIdx_x // 7) % 9 and (T.Mul(55, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(55, 7) + 
threadIdx_x // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -56 - -T.Mul(56, 7) - -T.Mul(56, 7) + threadIdx_x // 7 - -T.Mul(56, 7) + threadIdx_x // 7 < 576 - -T.Mul(56, 49) - -T.Mul(56, 49) + threadIdx_x - -T.Mul(56, 49) + threadIdx_x < 4032 - -T.Mul(56, 7) - -T.Mul(56, 7) + threadIdx_x // 7 - -(T.Mul(56, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(56, 7) + threadIdx_x // 7) % 9 - -T.Mul(56, 7) - -T.Mul(56, 7) + threadIdx_x // 7 - -(T.Mul(56, 7) + threadIdx_x // 7) % 9 - -(T.Mul(56, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(56, 7) + threadIdx_x // 7) % 9 and (T.Mul(56, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(56, 7) + threadIdx_x // 7) % 9 and (T.Mul(56, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(56, 7) + threadIdx_x // 7) % 9 and (T.Mul(56, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(56, 7) - -T.Mul(56, 7) + threadIdx_x // 7 - -(T.Mul(56, 7) + threadIdx_x // 7) // 9 - -(T.Mul(56, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(56, 7) - -T.Mul(56, 7) + threadIdx_x // 7 - -(T.Mul(56, 7) + threadIdx_x // 7) % 9 - -(T.Mul(56, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x // 7) 
% 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(56, 7) + threadIdx_x // 7) % 9 and (T.Mul(56, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(56, 49) - -T.Mul(56, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(56, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(56, 7) + threadIdx_x // 7) % 9 and (T.Mul(56, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(56, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(56, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(56, 7) + threadIdx_x // 7) % 9 and (T.Mul(56, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(56, 7) + threadIdx_x // 7 < 576: - if T.Mul(56, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(56, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(56, 7) + threadIdx_x 
// 7) % 9 and (T.Mul(56, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(56, 7) + threadIdx_x // 7 < 576: - if T.Mul(56, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(56, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(56, 7) + threadIdx_x // 7) % 9 and (T.Mul(56, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -57 - -T.Mul(57, 7) - -T.Mul(57, 7) + threadIdx_x // 7 - -T.Mul(57, 7) + threadIdx_x // 7 < 576 - -T.Mul(57, 49) - -T.Mul(57, 49) + threadIdx_x - -T.Mul(57, 49) + threadIdx_x < 4032 - -T.Mul(57, 7) - -T.Mul(57, 7) + threadIdx_x // 7 - -(T.Mul(57, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(57, 7) + threadIdx_x // 7) % 9 - -T.Mul(57, 7) - -T.Mul(57, 7) + threadIdx_x // 7 - -(T.Mul(57, 7) + threadIdx_x // 7) % 9 - -(T.Mul(57, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(57, 7) + threadIdx_x // 7) % 9 and (T.Mul(57, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(57, 7) + threadIdx_x // 7) % 9 and (T.Mul(57, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(57, 7) + threadIdx_x // 7) % 9 and (T.Mul(57, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(57, 7) - -T.Mul(57, 7) + threadIdx_x // 7 - -(T.Mul(57, 7) + threadIdx_x // 7) // 9 - -(T.Mul(57, 7) + threadIdx_x // 7) 
// 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(57, 7) - -T.Mul(57, 7) + threadIdx_x // 7 - -(T.Mul(57, 7) + threadIdx_x // 7) % 9 - -(T.Mul(57, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(57, 7) + threadIdx_x // 7) % 9 and (T.Mul(57, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(57, 49) - -T.Mul(57, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(57, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(57, 7) + threadIdx_x // 7) % 9 and (T.Mul(57, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(57, 49) + threadIdx_x < 4032: - pad_temp_shared 
= T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(57, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(57, 7) + threadIdx_x // 7) % 9 and (T.Mul(57, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(57, 7) + threadIdx_x // 7 < 576: - if T.Mul(57, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(57, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(57, 7) + threadIdx_x // 7) % 9 and (T.Mul(57, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(57, 7) + threadIdx_x // 7 < 576: - if T.Mul(57, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(57, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(57, 7) + threadIdx_x // 7) % 9 and (T.Mul(57, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -58 - -T.Mul(58, 7) - -T.Mul(58, 7) + threadIdx_x // 7 - -T.Mul(58, 7) + threadIdx_x // 7 < 576 - 
-T.Mul(58, 49) - -T.Mul(58, 49) + threadIdx_x - -T.Mul(58, 49) + threadIdx_x < 4032 - -T.Mul(58, 7) - -T.Mul(58, 7) + threadIdx_x // 7 - -(T.Mul(58, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(58, 7) + threadIdx_x // 7) % 9 - -T.Mul(58, 7) - -T.Mul(58, 7) + threadIdx_x // 7 - -(T.Mul(58, 7) + threadIdx_x // 7) % 9 - -(T.Mul(58, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(58, 7) + threadIdx_x // 7) % 9 and (T.Mul(58, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(58, 7) + threadIdx_x // 7) % 9 and (T.Mul(58, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(58, 7) + threadIdx_x // 7) % 9 and (T.Mul(58, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(58, 7) - -T.Mul(58, 7) + threadIdx_x // 7 - -(T.Mul(58, 7) + threadIdx_x // 7) // 9 - -(T.Mul(58, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(58, 7) - -T.Mul(58, 7) + threadIdx_x // 7 - -(T.Mul(58, 7) + threadIdx_x // 7) % 9 - -(T.Mul(58, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(58, 7) + threadIdx_x // 7) % 9 and (T.Mul(58, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + 
threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(58, 49) - -T.Mul(58, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(58, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(58, 7) + threadIdx_x // 7) % 9 and (T.Mul(58, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(58, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(58, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(58, 7) + threadIdx_x // 7) % 9 and (T.Mul(58, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(58, 7) + threadIdx_x // 7 < 576: - if T.Mul(58, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(58, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(58, 7) + threadIdx_x // 7) % 9 and (T.Mul(58, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(58, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(58, 7) + threadIdx_x // 7 < 576: - if T.Mul(58, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(58, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(58, 7) + threadIdx_x // 7) % 9 and (T.Mul(58, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -59 - -T.Mul(59, 7) - -T.Mul(59, 7) + threadIdx_x // 7 - -T.Mul(59, 7) + threadIdx_x // 7 < 576 - -T.Mul(59, 49) - -T.Mul(59, 49) + threadIdx_x - -T.Mul(59, 49) + threadIdx_x < 4032 - -T.Mul(59, 7) - -T.Mul(59, 7) + threadIdx_x // 7 - -(T.Mul(59, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(59, 7) + threadIdx_x // 7) % 9 - -T.Mul(59, 7) - -T.Mul(59, 7) + threadIdx_x // 7 - -(T.Mul(59, 7) + threadIdx_x // 7) % 9 - -(T.Mul(59, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(59, 7) + threadIdx_x // 7) % 9 and (T.Mul(59, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(59, 7) + threadIdx_x // 7) % 9 and (T.Mul(59, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(59, 7) + threadIdx_x // 7) % 9 and (T.Mul(59, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(59, 7) - -T.Mul(59, 7) + threadIdx_x // 7 - -(T.Mul(59, 7) + threadIdx_x // 7) // 9 - -(T.Mul(59, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(59, 7) - -T.Mul(59, 7) + threadIdx_x // 7 - -(T.Mul(59, 7) + threadIdx_x // 7) % 9 - -(T.Mul(59, 7) + threadIdx_x // 7) % 9 * 
7 - -rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(59, 7) + threadIdx_x // 7) % 9 and (T.Mul(59, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(59, 49) - -T.Mul(59, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(59, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(59, 7) + threadIdx_x // 7) % 9 and (T.Mul(59, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(59, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(59, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(59, 7) + 
threadIdx_x // 7) % 9 and (T.Mul(59, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(59, 7) + threadIdx_x // 7 < 576: - if T.Mul(59, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(59, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(59, 7) + threadIdx_x // 7) % 9 and (T.Mul(59, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(59, 7) + threadIdx_x // 7 < 576: - if T.Mul(59, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(59, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(59, 7) + threadIdx_x // 7) % 9 and (T.Mul(59, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -60 - -T.Mul(60, 7) - -T.Mul(60, 7) + threadIdx_x // 7 - -T.Mul(60, 7) + threadIdx_x // 7 < 576 - -T.Mul(60, 49) - -T.Mul(60, 49) + threadIdx_x - -T.Mul(60, 49) + threadIdx_x < 4032 - -T.Mul(60, 7) - -T.Mul(60, 7) + threadIdx_x // 7 - -(T.Mul(60, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(60, 7) + threadIdx_x // 
7) % 9 - -T.Mul(60, 7) - -T.Mul(60, 7) + threadIdx_x // 7 - -(T.Mul(60, 7) + threadIdx_x // 7) % 9 - -(T.Mul(60, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(60, 7) + threadIdx_x // 7) % 9 and (T.Mul(60, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(60, 7) + threadIdx_x // 7) % 9 and (T.Mul(60, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(60, 7) + threadIdx_x // 7) % 9 and (T.Mul(60, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(60, 7) - -T.Mul(60, 7) + threadIdx_x // 7 - -(T.Mul(60, 7) + threadIdx_x // 7) // 9 - -(T.Mul(60, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(60, 7) - -T.Mul(60, 7) + threadIdx_x // 7 - -(T.Mul(60, 7) + threadIdx_x // 7) % 9 - -(T.Mul(60, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(60, 7) + threadIdx_x // 7) % 9 and (T.Mul(60, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(60, 49) - -T.Mul(60, 
49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(60, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(60, 7) + threadIdx_x // 7) % 9 and (T.Mul(60, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(60, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(60, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(60, 7) + threadIdx_x // 7) % 9 and (T.Mul(60, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(60, 7) + threadIdx_x // 7 < 576: - if T.Mul(60, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(60, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(60, 7) + threadIdx_x // 7) % 9 and (T.Mul(60, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(60, 7) + threadIdx_x // 7 < 576: - if T.Mul(60, 49) + 
threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(60, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(60, 7) + threadIdx_x // 7) % 9 and (T.Mul(60, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -61 - -T.Mul(61, 7) - -T.Mul(61, 7) + threadIdx_x // 7 - -T.Mul(61, 7) + threadIdx_x // 7 < 576 - -T.Mul(61, 49) - -T.Mul(61, 49) + threadIdx_x - -T.Mul(61, 49) + threadIdx_x < 4032 - -T.Mul(61, 7) - -T.Mul(61, 7) + threadIdx_x // 7 - -(T.Mul(61, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(61, 7) + threadIdx_x // 7) % 9 - -T.Mul(61, 7) - -T.Mul(61, 7) + threadIdx_x // 7 - -(T.Mul(61, 7) + threadIdx_x // 7) % 9 - -(T.Mul(61, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(61, 7) + threadIdx_x // 7) % 9 and (T.Mul(61, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(61, 7) + threadIdx_x // 7) % 9 and (T.Mul(61, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(61, 7) + threadIdx_x // 7) % 9 and (T.Mul(61, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(61, 7) - -T.Mul(61, 7) + threadIdx_x // 7 - -(T.Mul(61, 7) + threadIdx_x // 7) // 9 - -(T.Mul(61, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(61, 7) - -T.Mul(61, 7) + threadIdx_x // 7 - -(T.Mul(61, 7) + threadIdx_x // 7) % 9 - -(T.Mul(61, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x // 
7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(61, 7) + threadIdx_x // 7) % 9 and (T.Mul(61, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(61, 49) - -T.Mul(61, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(61, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(61, 7) + threadIdx_x // 7) % 9 and (T.Mul(61, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(61, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(61, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(61, 7) + threadIdx_x // 7) % 9 and (T.Mul(61, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(61, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(61, 7) + threadIdx_x // 7 < 576: - if T.Mul(61, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(61, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(61, 7) + threadIdx_x // 7) % 9 and (T.Mul(61, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(61, 7) + threadIdx_x // 7 < 576: - if T.Mul(61, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(61, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(61, 7) + threadIdx_x // 7) % 9 and (T.Mul(61, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -62 - -T.Mul(62, 7) - -T.Mul(62, 7) + threadIdx_x // 7 - -T.Mul(62, 7) + threadIdx_x // 7 < 576 - -T.Mul(62, 49) - -T.Mul(62, 49) + threadIdx_x - -T.Mul(62, 49) + threadIdx_x < 4032 - -T.Mul(62, 7) - -T.Mul(62, 7) + threadIdx_x // 7 - -(T.Mul(62, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(62, 7) + threadIdx_x // 7) % 9 - -T.Mul(62, 7) - -T.Mul(62, 7) + threadIdx_x // 7 - -(T.Mul(62, 7) + threadIdx_x // 7) % 9 - -(T.Mul(62, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(62, 7) + threadIdx_x // 7) % 9 and (T.Mul(62, 7) + threadIdx_x // 7) 
% 9 < 8 - -1 <= (T.Mul(62, 7) + threadIdx_x // 7) % 9 and (T.Mul(62, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(62, 7) + threadIdx_x // 7) % 9 and (T.Mul(62, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(62, 7) - -T.Mul(62, 7) + threadIdx_x // 7 - -(T.Mul(62, 7) + threadIdx_x // 7) // 9 - -(T.Mul(62, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(62, 7) - -T.Mul(62, 7) + threadIdx_x // 7 - -(T.Mul(62, 7) + threadIdx_x // 7) % 9 - -(T.Mul(62, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(62, 7) + threadIdx_x // 7) % 9 and (T.Mul(62, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(62, 49) - -T.Mul(62, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(62, 49) + threadIdx_x] = 
T.if_then_else(1 <= (T.Mul(62, 7) + threadIdx_x // 7) % 9 and (T.Mul(62, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(62, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(62, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(62, 7) + threadIdx_x // 7) % 9 and (T.Mul(62, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(62, 7) + threadIdx_x // 7 < 576: - if T.Mul(62, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(62, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(62, 7) + threadIdx_x // 7) % 9 and (T.Mul(62, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(62, 7) + threadIdx_x // 7 < 576: - if T.Mul(62, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(62, 49) + threadIdx_x] = T.if_then_else(1 <= 
(T.Mul(62, 7) + threadIdx_x // 7) % 9 and (T.Mul(62, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -63 - -T.Mul(63, 7) - -T.Mul(63, 7) + threadIdx_x // 7 - -T.Mul(63, 7) + threadIdx_x // 7 < 576 - -T.Mul(63, 49) - -T.Mul(63, 49) + threadIdx_x - -T.Mul(63, 49) + threadIdx_x < 4032 - -T.Mul(63, 7) - -T.Mul(63, 7) + threadIdx_x // 7 - -(T.Mul(63, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(63, 7) + threadIdx_x // 7) % 9 - -T.Mul(63, 7) - -T.Mul(63, 7) + threadIdx_x // 7 - -(T.Mul(63, 7) + threadIdx_x // 7) % 9 - -(T.Mul(63, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(63, 7) + threadIdx_x // 7) % 9 and (T.Mul(63, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(63, 7) + threadIdx_x // 7) % 9 and (T.Mul(63, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(63, 7) + threadIdx_x // 7) % 9 and (T.Mul(63, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(63, 7) - -T.Mul(63, 7) + threadIdx_x // 7 - -(T.Mul(63, 7) + threadIdx_x // 7) // 9 - -(T.Mul(63, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(63, 7) - -T.Mul(63, 7) + threadIdx_x // 7 - -(T.Mul(63, 7) + threadIdx_x // 7) % 9 - -(T.Mul(63, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(63, 7) + 
threadIdx_x // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(63, 7) + threadIdx_x // 7) % 9 and (T.Mul(63, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(63, 49) - -T.Mul(63, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(63, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(63, 7) + threadIdx_x // 7) % 9 and (T.Mul(63, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(63, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(63, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(63, 7) + threadIdx_x // 7) % 9 and (T.Mul(63, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(63, 7) + threadIdx_x // 7 < 576: - if T.Mul(63, 49) + threadIdx_x < 4032: - 
pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(63, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(63, 7) + threadIdx_x // 7) % 9 and (T.Mul(63, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(63, 7) + threadIdx_x // 7 < 576: - if T.Mul(63, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(63, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(63, 7) + threadIdx_x // 7) % 9 and (T.Mul(63, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -64 - -T.Mul(64, 7) - -T.Mul(64, 7) + threadIdx_x // 7 - -T.Mul(64, 7) + threadIdx_x // 7 < 576 - -T.Mul(64, 49) - -T.Mul(64, 49) + threadIdx_x - -T.Mul(64, 49) + threadIdx_x < 4032 - -T.Mul(64, 7) - -T.Mul(64, 7) + threadIdx_x // 7 - -(T.Mul(64, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(64, 7) + threadIdx_x // 7) % 9 - -T.Mul(64, 7) - -T.Mul(64, 7) + threadIdx_x // 7 - -(T.Mul(64, 7) + threadIdx_x // 7) % 9 - -(T.Mul(64, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(64, 7) + threadIdx_x // 7) % 9 and (T.Mul(64, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(64, 7) + threadIdx_x // 7) % 9 and (T.Mul(64, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(64, 7) + threadIdx_x // 7) % 9 and (T.Mul(64, 7) + 
threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(64, 7) - -T.Mul(64, 7) + threadIdx_x // 7 - -(T.Mul(64, 7) + threadIdx_x // 7) // 9 - -(T.Mul(64, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(64, 7) - -T.Mul(64, 7) + threadIdx_x // 7 - -(T.Mul(64, 7) + threadIdx_x // 7) % 9 - -(T.Mul(64, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(64, 7) + threadIdx_x // 7) % 9 and (T.Mul(64, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(64, 49) - -T.Mul(64, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(64, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(64, 7) + threadIdx_x // 7) % 9 and (T.Mul(64, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer 
* 3136 + (T.Mul(64, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(64, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(64, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(64, 7) + threadIdx_x // 7) % 9 and (T.Mul(64, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(64, 7) + threadIdx_x // 7 < 576: - if T.Mul(64, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(64, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(64, 7) + threadIdx_x // 7) % 9 and (T.Mul(64, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(64, 7) + threadIdx_x // 7 < 576: - if T.Mul(64, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(64, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(64, 7) + threadIdx_x // 7) % 9 and (T.Mul(64, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(64, 7) + 
threadIdx_x // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -65 - -T.Mul(65, 7) - -T.Mul(65, 7) + threadIdx_x // 7 - -T.Mul(65, 7) + threadIdx_x // 7 < 576 - -T.Mul(65, 49) - -T.Mul(65, 49) + threadIdx_x - -T.Mul(65, 49) + threadIdx_x < 4032 - -T.Mul(65, 7) - -T.Mul(65, 7) + threadIdx_x // 7 - -(T.Mul(65, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(65, 7) + threadIdx_x // 7) % 9 - -T.Mul(65, 7) - -T.Mul(65, 7) + threadIdx_x // 7 - -(T.Mul(65, 7) + threadIdx_x // 7) % 9 - -(T.Mul(65, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(65, 7) + threadIdx_x // 7) % 9 and (T.Mul(65, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(65, 7) + threadIdx_x // 7) % 9 and (T.Mul(65, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(65, 7) + threadIdx_x // 7) % 9 and (T.Mul(65, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(65, 7) - -T.Mul(65, 7) + threadIdx_x // 7 - -(T.Mul(65, 7) + threadIdx_x // 7) // 9 - -(T.Mul(65, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(65, 7) - -T.Mul(65, 7) + threadIdx_x // 7 - -(T.Mul(65, 7) + threadIdx_x // 7) % 9 - -(T.Mul(65, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x // 7) 
% 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(65, 7) + threadIdx_x // 7) % 9 and (T.Mul(65, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(65, 49) - -T.Mul(65, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(65, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(65, 7) + threadIdx_x // 7) % 9 and (T.Mul(65, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(65, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(65, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(65, 7) + threadIdx_x // 7) % 9 and (T.Mul(65, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(65, 7) + threadIdx_x // 7 < 576: - if T.Mul(65, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(65, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(65, 7) + threadIdx_x 
// 7) % 9 and (T.Mul(65, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(65, 7) + threadIdx_x // 7 < 576: - if T.Mul(65, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(65, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(65, 7) + threadIdx_x // 7) % 9 and (T.Mul(65, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -66 - -T.Mul(66, 7) - -T.Mul(66, 7) + threadIdx_x // 7 - -T.Mul(66, 7) + threadIdx_x // 7 < 576 - -T.Mul(66, 49) - -T.Mul(66, 49) + threadIdx_x - -T.Mul(66, 49) + threadIdx_x < 4032 - -T.Mul(66, 7) - -T.Mul(66, 7) + threadIdx_x // 7 - -(T.Mul(66, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(66, 7) + threadIdx_x // 7) % 9 - -T.Mul(66, 7) - -T.Mul(66, 7) + threadIdx_x // 7 - -(T.Mul(66, 7) + threadIdx_x // 7) % 9 - -(T.Mul(66, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(66, 7) + threadIdx_x // 7) % 9 and (T.Mul(66, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(66, 7) + threadIdx_x // 7) % 9 and (T.Mul(66, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(66, 7) + threadIdx_x // 7) % 9 and (T.Mul(66, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(66, 7) - -T.Mul(66, 7) + threadIdx_x // 7 - -(T.Mul(66, 7) + threadIdx_x // 7) // 9 - -(T.Mul(66, 7) + threadIdx_x // 7) 
// 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(66, 7) - -T.Mul(66, 7) + threadIdx_x // 7 - -(T.Mul(66, 7) + threadIdx_x // 7) % 9 - -(T.Mul(66, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(66, 7) + threadIdx_x // 7) % 9 and (T.Mul(66, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(66, 49) - -T.Mul(66, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(66, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(66, 7) + threadIdx_x // 7) % 9 and (T.Mul(66, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(66, 49) + threadIdx_x < 4032: - pad_temp_shared 
= T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(66, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(66, 7) + threadIdx_x // 7) % 9 and (T.Mul(66, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(66, 7) + threadIdx_x // 7 < 576: - if T.Mul(66, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(66, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(66, 7) + threadIdx_x // 7) % 9 and (T.Mul(66, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(66, 7) + threadIdx_x // 7 < 576: - if T.Mul(66, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(66, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(66, 7) + threadIdx_x // 7) % 9 and (T.Mul(66, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -67 - -T.Mul(67, 7) - -T.Mul(67, 7) + threadIdx_x // 7 - -T.Mul(67, 7) + threadIdx_x // 7 < 576 - 
-T.Mul(67, 49) - -T.Mul(67, 49) + threadIdx_x - -T.Mul(67, 49) + threadIdx_x < 4032 - -T.Mul(67, 7) - -T.Mul(67, 7) + threadIdx_x // 7 - -(T.Mul(67, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(67, 7) + threadIdx_x // 7) % 9 - -T.Mul(67, 7) - -T.Mul(67, 7) + threadIdx_x // 7 - -(T.Mul(67, 7) + threadIdx_x // 7) % 9 - -(T.Mul(67, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(67, 7) + threadIdx_x // 7) % 9 and (T.Mul(67, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(67, 7) + threadIdx_x // 7) % 9 and (T.Mul(67, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(67, 7) + threadIdx_x // 7) % 9 and (T.Mul(67, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(67, 7) - -T.Mul(67, 7) + threadIdx_x // 7 - -(T.Mul(67, 7) + threadIdx_x // 7) // 9 - -(T.Mul(67, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(67, 7) - -T.Mul(67, 7) + threadIdx_x // 7 - -(T.Mul(67, 7) + threadIdx_x // 7) % 9 - -(T.Mul(67, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(67, 7) + threadIdx_x // 7) % 9 and (T.Mul(67, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + 
threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(67, 49) - -T.Mul(67, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(67, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(67, 7) + threadIdx_x // 7) % 9 and (T.Mul(67, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(67, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(67, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(67, 7) + threadIdx_x // 7) % 9 and (T.Mul(67, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(67, 7) + threadIdx_x // 7 < 576: - if T.Mul(67, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(67, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(67, 7) + threadIdx_x // 7) % 9 and (T.Mul(67, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(67, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(67, 7) + threadIdx_x // 7 < 576: - if T.Mul(67, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(67, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(67, 7) + threadIdx_x // 7) % 9 and (T.Mul(67, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -68 - -T.Mul(68, 7) - -T.Mul(68, 7) + threadIdx_x // 7 - -T.Mul(68, 7) + threadIdx_x // 7 < 576 - -T.Mul(68, 49) - -T.Mul(68, 49) + threadIdx_x - -T.Mul(68, 49) + threadIdx_x < 4032 - -T.Mul(68, 7) - -T.Mul(68, 7) + threadIdx_x // 7 - -(T.Mul(68, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(68, 7) + threadIdx_x // 7) % 9 - -T.Mul(68, 7) - -T.Mul(68, 7) + threadIdx_x // 7 - -(T.Mul(68, 7) + threadIdx_x // 7) % 9 - -(T.Mul(68, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(68, 7) + threadIdx_x // 7) % 9 and (T.Mul(68, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(68, 7) + threadIdx_x // 7) % 9 and (T.Mul(68, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(68, 7) + threadIdx_x // 7) % 9 and (T.Mul(68, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(68, 7) - -T.Mul(68, 7) + threadIdx_x // 7 - -(T.Mul(68, 7) + threadIdx_x // 7) // 9 - -(T.Mul(68, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(68, 7) - -T.Mul(68, 7) + threadIdx_x // 7 - -(T.Mul(68, 7) + threadIdx_x // 7) % 9 - -(T.Mul(68, 7) + threadIdx_x // 7) % 9 * 
7 - -rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(68, 7) + threadIdx_x // 7) % 9 and (T.Mul(68, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(68, 49) - -T.Mul(68, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(68, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(68, 7) + threadIdx_x // 7) % 9 and (T.Mul(68, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(68, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(68, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(68, 7) + 
threadIdx_x // 7) % 9 and (T.Mul(68, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(68, 7) + threadIdx_x // 7 < 576: - if T.Mul(68, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(68, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(68, 7) + threadIdx_x // 7) % 9 and (T.Mul(68, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(68, 7) + threadIdx_x // 7 < 576: - if T.Mul(68, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(68, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(68, 7) + threadIdx_x // 7) % 9 and (T.Mul(68, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -69 - -T.Mul(69, 7) - -T.Mul(69, 7) + threadIdx_x // 7 - -T.Mul(69, 7) + threadIdx_x // 7 < 576 - -T.Mul(69, 49) - -T.Mul(69, 49) + threadIdx_x - -T.Mul(69, 49) + threadIdx_x < 4032 - -T.Mul(69, 7) - -T.Mul(69, 7) + threadIdx_x // 7 - -(T.Mul(69, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(69, 7) + threadIdx_x // 
7) % 9 - -T.Mul(69, 7) - -T.Mul(69, 7) + threadIdx_x // 7 - -(T.Mul(69, 7) + threadIdx_x // 7) % 9 - -(T.Mul(69, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(69, 7) + threadIdx_x // 7) % 9 and (T.Mul(69, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(69, 7) + threadIdx_x // 7) % 9 and (T.Mul(69, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(69, 7) + threadIdx_x // 7) % 9 and (T.Mul(69, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(69, 7) - -T.Mul(69, 7) + threadIdx_x // 7 - -(T.Mul(69, 7) + threadIdx_x // 7) // 9 - -(T.Mul(69, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(69, 7) - -T.Mul(69, 7) + threadIdx_x // 7 - -(T.Mul(69, 7) + threadIdx_x // 7) % 9 - -(T.Mul(69, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(69, 7) + threadIdx_x // 7) % 9 and (T.Mul(69, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(69, 49) - -T.Mul(69, 
49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(69, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(69, 7) + threadIdx_x // 7) % 9 and (T.Mul(69, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(69, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(69, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(69, 7) + threadIdx_x // 7) % 9 and (T.Mul(69, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(69, 7) + threadIdx_x // 7 < 576: - if T.Mul(69, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(69, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(69, 7) + threadIdx_x // 7) % 9 and (T.Mul(69, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(69, 7) + threadIdx_x // 7 < 576: - if T.Mul(69, 49) + 
threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(69, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(69, 7) + threadIdx_x // 7) % 9 and (T.Mul(69, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -70 - -T.Mul(70, 7) - -T.Mul(70, 7) + threadIdx_x // 7 - -T.Mul(70, 7) + threadIdx_x // 7 < 576 - -T.Mul(70, 49) - -T.Mul(70, 49) + threadIdx_x - -T.Mul(70, 49) + threadIdx_x < 4032 - -T.Mul(70, 7) - -T.Mul(70, 7) + threadIdx_x // 7 - -(T.Mul(70, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(70, 7) + threadIdx_x // 7) % 9 - -T.Mul(70, 7) - -T.Mul(70, 7) + threadIdx_x // 7 - -(T.Mul(70, 7) + threadIdx_x // 7) % 9 - -(T.Mul(70, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(70, 7) + threadIdx_x // 7) % 9 and (T.Mul(70, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(70, 7) + threadIdx_x // 7) % 9 and (T.Mul(70, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(70, 7) + threadIdx_x // 7) % 9 and (T.Mul(70, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(70, 7) - -T.Mul(70, 7) + threadIdx_x // 7 - -(T.Mul(70, 7) + threadIdx_x // 7) // 9 - -(T.Mul(70, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(70, 7) - -T.Mul(70, 7) + threadIdx_x // 7 - -(T.Mul(70, 7) + threadIdx_x // 7) % 9 - -(T.Mul(70, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x // 
7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(70, 7) + threadIdx_x // 7) % 9 and (T.Mul(70, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(70, 49) - -T.Mul(70, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(70, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(70, 7) + threadIdx_x // 7) % 9 and (T.Mul(70, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(70, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(70, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(70, 7) + threadIdx_x // 7) % 9 and (T.Mul(70, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(70, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(70, 7) + threadIdx_x // 7 < 576: - if T.Mul(70, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(70, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(70, 7) + threadIdx_x // 7) % 9 and (T.Mul(70, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(70, 7) + threadIdx_x // 7 < 576: - if T.Mul(70, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(70, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(70, 7) + threadIdx_x // 7) % 9 and (T.Mul(70, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -71 - -T.Mul(71, 7) - -T.Mul(71, 7) + threadIdx_x // 7 - -T.Mul(71, 7) + threadIdx_x // 7 < 576 - -T.Mul(71, 49) - -T.Mul(71, 49) + threadIdx_x - -T.Mul(71, 49) + threadIdx_x < 4032 - -T.Mul(71, 7) - -T.Mul(71, 7) + threadIdx_x // 7 - -(T.Mul(71, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(71, 7) + threadIdx_x // 7) % 9 - -T.Mul(71, 7) - -T.Mul(71, 7) + threadIdx_x // 7 - -(T.Mul(71, 7) + threadIdx_x // 7) % 9 - -(T.Mul(71, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(71, 7) + threadIdx_x // 7) % 9 and (T.Mul(71, 7) + threadIdx_x // 7) 
% 9 < 8 - -1 <= (T.Mul(71, 7) + threadIdx_x // 7) % 9 and (T.Mul(71, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(71, 7) + threadIdx_x // 7) % 9 and (T.Mul(71, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(71, 7) - -T.Mul(71, 7) + threadIdx_x // 7 - -(T.Mul(71, 7) + threadIdx_x // 7) // 9 - -(T.Mul(71, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(71, 7) - -T.Mul(71, 7) + threadIdx_x // 7 - -(T.Mul(71, 7) + threadIdx_x // 7) % 9 - -(T.Mul(71, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(71, 7) + threadIdx_x // 7) % 9 and (T.Mul(71, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(71, 49) - -T.Mul(71, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(71, 49) + threadIdx_x] = 
T.if_then_else(1 <= (T.Mul(71, 7) + threadIdx_x // 7) % 9 and (T.Mul(71, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(71, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(71, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(71, 7) + threadIdx_x // 7) % 9 and (T.Mul(71, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(71, 7) + threadIdx_x // 7 < 576: - if T.Mul(71, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(71, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(71, 7) + threadIdx_x // 7) % 9 and (T.Mul(71, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(71, 7) + threadIdx_x // 7 < 576: - if T.Mul(71, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(71, 49) + threadIdx_x] = T.if_then_else(1 <= 
(T.Mul(71, 7) + threadIdx_x // 7) % 9 and (T.Mul(71, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -72 - -T.Mul(72, 7) - -T.Mul(72, 7) + threadIdx_x // 7 - -T.Mul(72, 7) + threadIdx_x // 7 < 576 - -T.Mul(72, 49) - -T.Mul(72, 49) + threadIdx_x - -T.Mul(72, 49) + threadIdx_x < 4032 - -T.Mul(72, 7) - -T.Mul(72, 7) + threadIdx_x // 7 - -(T.Mul(72, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(72, 7) + threadIdx_x // 7) % 9 - -T.Mul(72, 7) - -T.Mul(72, 7) + threadIdx_x // 7 - -(T.Mul(72, 7) + threadIdx_x // 7) % 9 - -(T.Mul(72, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(72, 7) + threadIdx_x // 7) % 9 and (T.Mul(72, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(72, 7) + threadIdx_x // 7) % 9 and (T.Mul(72, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(72, 7) + threadIdx_x // 7) % 9 and (T.Mul(72, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(72, 7) - -T.Mul(72, 7) + threadIdx_x // 7 - -(T.Mul(72, 7) + threadIdx_x // 7) // 9 - -(T.Mul(72, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(72, 7) - -T.Mul(72, 7) + threadIdx_x // 7 - -(T.Mul(72, 7) + threadIdx_x // 7) % 9 - -(T.Mul(72, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(72, 7) + 
threadIdx_x // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(72, 7) + threadIdx_x // 7) % 9 and (T.Mul(72, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(72, 49) - -T.Mul(72, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(72, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(72, 7) + threadIdx_x // 7) % 9 and (T.Mul(72, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(72, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(72, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(72, 7) + threadIdx_x // 7) % 9 and (T.Mul(72, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(72, 7) + threadIdx_x // 7 < 576: - if T.Mul(72, 49) + threadIdx_x < 4032: - 
pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(72, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(72, 7) + threadIdx_x // 7) % 9 and (T.Mul(72, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(72, 7) + threadIdx_x // 7 < 576: - if T.Mul(72, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(72, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(72, 7) + threadIdx_x // 7) % 9 and (T.Mul(72, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -73 - -T.Mul(73, 7) - -T.Mul(73, 7) + threadIdx_x // 7 - -T.Mul(73, 7) + threadIdx_x // 7 < 576 - -T.Mul(73, 49) - -T.Mul(73, 49) + threadIdx_x - -T.Mul(73, 49) + threadIdx_x < 4032 - -T.Mul(73, 7) - -T.Mul(73, 7) + threadIdx_x // 7 - -(T.Mul(73, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(73, 7) + threadIdx_x // 7) % 9 - -T.Mul(73, 7) - -T.Mul(73, 7) + threadIdx_x // 7 - -(T.Mul(73, 7) + threadIdx_x // 7) % 9 - -(T.Mul(73, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(73, 7) + threadIdx_x // 7) % 9 and (T.Mul(73, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(73, 7) + threadIdx_x // 7) % 9 and (T.Mul(73, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(73, 7) + threadIdx_x // 7) % 9 and (T.Mul(73, 7) + 
threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(73, 7) - -T.Mul(73, 7) + threadIdx_x // 7 - -(T.Mul(73, 7) + threadIdx_x // 7) // 9 - -(T.Mul(73, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(73, 7) - -T.Mul(73, 7) + threadIdx_x // 7 - -(T.Mul(73, 7) + threadIdx_x // 7) % 9 - -(T.Mul(73, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(73, 7) + threadIdx_x // 7) % 9 and (T.Mul(73, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(73, 49) - -T.Mul(73, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(73, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(73, 7) + threadIdx_x // 7) % 9 and (T.Mul(73, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer 
* 3136 + (T.Mul(73, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(73, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(73, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(73, 7) + threadIdx_x // 7) % 9 and (T.Mul(73, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(73, 7) + threadIdx_x // 7 < 576: - if T.Mul(73, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(73, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(73, 7) + threadIdx_x // 7) % 9 and (T.Mul(73, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(73, 7) + threadIdx_x // 7 < 576: - if T.Mul(73, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(73, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(73, 7) + threadIdx_x // 7) % 9 and (T.Mul(73, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(73, 7) + 
threadIdx_x // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -74 - -T.Mul(74, 7) - -T.Mul(74, 7) + threadIdx_x // 7 - -T.Mul(74, 7) + threadIdx_x // 7 < 576 - -T.Mul(74, 49) - -T.Mul(74, 49) + threadIdx_x - -T.Mul(74, 49) + threadIdx_x < 4032 - -T.Mul(74, 7) - -T.Mul(74, 7) + threadIdx_x // 7 - -(T.Mul(74, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(74, 7) + threadIdx_x // 7) % 9 - -T.Mul(74, 7) - -T.Mul(74, 7) + threadIdx_x // 7 - -(T.Mul(74, 7) + threadIdx_x // 7) % 9 - -(T.Mul(74, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(74, 7) + threadIdx_x // 7) % 9 and (T.Mul(74, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(74, 7) + threadIdx_x // 7) % 9 and (T.Mul(74, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(74, 7) + threadIdx_x // 7) % 9 and (T.Mul(74, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(74, 7) - -T.Mul(74, 7) + threadIdx_x // 7 - -(T.Mul(74, 7) + threadIdx_x // 7) // 9 - -(T.Mul(74, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(74, 7) - -T.Mul(74, 7) + threadIdx_x // 7 - -(T.Mul(74, 7) + threadIdx_x // 7) % 9 - -(T.Mul(74, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x // 7) 
% 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(74, 7) + threadIdx_x // 7) % 9 and (T.Mul(74, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(74, 49) - -T.Mul(74, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(74, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(74, 7) + threadIdx_x // 7) % 9 and (T.Mul(74, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(74, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(74, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(74, 7) + threadIdx_x // 7) % 9 and (T.Mul(74, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(74, 7) + threadIdx_x // 7 < 576: - if T.Mul(74, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(74, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(74, 7) + threadIdx_x 
// 7) % 9 and (T.Mul(74, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(74, 7) + threadIdx_x // 7 < 576: - if T.Mul(74, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(74, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(74, 7) + threadIdx_x // 7) % 9 and (T.Mul(74, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -75 - -T.Mul(75, 7) - -T.Mul(75, 7) + threadIdx_x // 7 - -T.Mul(75, 7) + threadIdx_x // 7 < 576 - -T.Mul(75, 49) - -T.Mul(75, 49) + threadIdx_x - -T.Mul(75, 49) + threadIdx_x < 4032 - -T.Mul(75, 7) - -T.Mul(75, 7) + threadIdx_x // 7 - -(T.Mul(75, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(75, 7) + threadIdx_x // 7) % 9 - -T.Mul(75, 7) - -T.Mul(75, 7) + threadIdx_x // 7 - -(T.Mul(75, 7) + threadIdx_x // 7) % 9 - -(T.Mul(75, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(75, 7) + threadIdx_x // 7) % 9 and (T.Mul(75, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(75, 7) + threadIdx_x // 7) % 9 and (T.Mul(75, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(75, 7) + threadIdx_x // 7) % 9 and (T.Mul(75, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(75, 7) - -T.Mul(75, 7) + threadIdx_x // 7 - -(T.Mul(75, 7) + threadIdx_x // 7) // 9 - -(T.Mul(75, 7) + threadIdx_x // 7) 
// 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(75, 7) - -T.Mul(75, 7) + threadIdx_x // 7 - -(T.Mul(75, 7) + threadIdx_x // 7) % 9 - -(T.Mul(75, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(75, 7) + threadIdx_x // 7) % 9 and (T.Mul(75, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(75, 49) - -T.Mul(75, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(75, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(75, 7) + threadIdx_x // 7) % 9 and (T.Mul(75, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(75, 49) + threadIdx_x < 4032: - pad_temp_shared 
= T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(75, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(75, 7) + threadIdx_x // 7) % 9 and (T.Mul(75, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(75, 7) + threadIdx_x // 7 < 576: - if T.Mul(75, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(75, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(75, 7) + threadIdx_x // 7) % 9 and (T.Mul(75, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(75, 7) + threadIdx_x // 7 < 576: - if T.Mul(75, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(75, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(75, 7) + threadIdx_x // 7) % 9 and (T.Mul(75, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -76 - -T.Mul(76, 7) - -T.Mul(76, 7) + threadIdx_x // 7 - -T.Mul(76, 7) + threadIdx_x // 7 < 576 - 
-T.Mul(76, 49) - -T.Mul(76, 49) + threadIdx_x - -T.Mul(76, 49) + threadIdx_x < 4032 - -T.Mul(76, 7) - -T.Mul(76, 7) + threadIdx_x // 7 - -(T.Mul(76, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(76, 7) + threadIdx_x // 7) % 9 - -T.Mul(76, 7) - -T.Mul(76, 7) + threadIdx_x // 7 - -(T.Mul(76, 7) + threadIdx_x // 7) % 9 - -(T.Mul(76, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(76, 7) + threadIdx_x // 7) % 9 and (T.Mul(76, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(76, 7) + threadIdx_x // 7) % 9 and (T.Mul(76, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(76, 7) + threadIdx_x // 7) % 9 and (T.Mul(76, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(76, 7) - -T.Mul(76, 7) + threadIdx_x // 7 - -(T.Mul(76, 7) + threadIdx_x // 7) // 9 - -(T.Mul(76, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(76, 7) - -T.Mul(76, 7) + threadIdx_x // 7 - -(T.Mul(76, 7) + threadIdx_x // 7) % 9 - -(T.Mul(76, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(76, 7) + threadIdx_x // 7) % 9 and (T.Mul(76, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + 
threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(76, 49) - -T.Mul(76, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(76, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(76, 7) + threadIdx_x // 7) % 9 and (T.Mul(76, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(76, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(76, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(76, 7) + threadIdx_x // 7) % 9 and (T.Mul(76, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(76, 7) + threadIdx_x // 7 < 576: - if T.Mul(76, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(76, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(76, 7) + threadIdx_x // 7) % 9 and (T.Mul(76, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(76, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(76, 7) + threadIdx_x // 7 < 576: - if T.Mul(76, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(76, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(76, 7) + threadIdx_x // 7) % 9 and (T.Mul(76, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -77 - -T.Mul(77, 7) - -T.Mul(77, 7) + threadIdx_x // 7 - -T.Mul(77, 7) + threadIdx_x // 7 < 576 - -T.Mul(77, 49) - -T.Mul(77, 49) + threadIdx_x - -T.Mul(77, 49) + threadIdx_x < 4032 - -T.Mul(77, 7) - -T.Mul(77, 7) + threadIdx_x // 7 - -(T.Mul(77, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(77, 7) + threadIdx_x // 7) % 9 - -T.Mul(77, 7) - -T.Mul(77, 7) + threadIdx_x // 7 - -(T.Mul(77, 7) + threadIdx_x // 7) % 9 - -(T.Mul(77, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(77, 7) + threadIdx_x // 7) % 9 and (T.Mul(77, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(77, 7) + threadIdx_x // 7) % 9 and (T.Mul(77, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(77, 7) + threadIdx_x // 7) % 9 and (T.Mul(77, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(77, 7) - -T.Mul(77, 7) + threadIdx_x // 7 - -(T.Mul(77, 7) + threadIdx_x // 7) // 9 - -(T.Mul(77, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(77, 7) - -T.Mul(77, 7) + threadIdx_x // 7 - -(T.Mul(77, 7) + threadIdx_x // 7) % 9 - -(T.Mul(77, 7) + threadIdx_x // 7) % 9 * 
7 - -rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(77, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(77, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(77, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(77, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(77, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(77, 7) + threadIdx_x // 7) % 9 and (T.Mul(77, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(77, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(77, 49) - -T.Mul(77, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(77, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(77, 7) + threadIdx_x // 7) % 9 and (T.Mul(77, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(77, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(77, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(77, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(77, 7) + 
threadIdx_x // 7) % 9 and (T.Mul(77, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(77, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(77, 7) + threadIdx_x // 7 < 576: - if T.Mul(77, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(77, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(77, 7) + threadIdx_x // 7) % 9 and (T.Mul(77, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(77, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(77, 7) + threadIdx_x // 7 < 576: - if T.Mul(77, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(77, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(77, 7) + threadIdx_x // 7) % 9 and (T.Mul(77, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(77, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -78 - -T.Mul(78, 7) - -T.Mul(78, 7) + threadIdx_x // 7 - -T.Mul(78, 7) + threadIdx_x // 7 < 576 - -T.Mul(78, 49) - -T.Mul(78, 49) + threadIdx_x - -T.Mul(78, 49) + threadIdx_x < 4032 - -T.Mul(78, 7) - -T.Mul(78, 7) + threadIdx_x // 7 - -(T.Mul(78, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(78, 7) + threadIdx_x // 
7) % 9 - -T.Mul(78, 7) - -T.Mul(78, 7) + threadIdx_x // 7 - -(T.Mul(78, 7) + threadIdx_x // 7) % 9 - -(T.Mul(78, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(78, 7) + threadIdx_x // 7) % 9 and (T.Mul(78, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(78, 7) + threadIdx_x // 7) % 9 and (T.Mul(78, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(78, 7) + threadIdx_x // 7) % 9 and (T.Mul(78, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(78, 7) - -T.Mul(78, 7) + threadIdx_x // 7 - -(T.Mul(78, 7) + threadIdx_x // 7) // 9 - -(T.Mul(78, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(78, 7) - -T.Mul(78, 7) + threadIdx_x // 7 - -(T.Mul(78, 7) + threadIdx_x // 7) % 9 - -(T.Mul(78, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(78, 7) + threadIdx_x // 7) % 9 and (T.Mul(78, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(78, 49) - -T.Mul(78, 
49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(78, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(78, 7) + threadIdx_x // 7) % 9 and (T.Mul(78, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(78, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(78, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(78, 7) + threadIdx_x // 7) % 9 and (T.Mul(78, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(78, 7) + threadIdx_x // 7 < 576: - if T.Mul(78, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(78, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(78, 7) + threadIdx_x // 7) % 9 and (T.Mul(78, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(78, 7) + threadIdx_x // 7 < 576: - if T.Mul(78, 49) + 
threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(78, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(78, 7) + threadIdx_x // 7) % 9 and (T.Mul(78, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -79 - -T.Mul(79, 7) - -T.Mul(79, 7) + threadIdx_x // 7 - -T.Mul(79, 7) + threadIdx_x // 7 < 576 - -T.Mul(79, 49) - -T.Mul(79, 49) + threadIdx_x - -T.Mul(79, 49) + threadIdx_x < 4032 - -T.Mul(79, 7) - -T.Mul(79, 7) + threadIdx_x // 7 - -(T.Mul(79, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(79, 7) + threadIdx_x // 7) % 9 - -T.Mul(79, 7) - -T.Mul(79, 7) + threadIdx_x // 7 - -(T.Mul(79, 7) + threadIdx_x // 7) % 9 - -(T.Mul(79, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(79, 7) + threadIdx_x // 7) % 9 and (T.Mul(79, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(79, 7) + threadIdx_x // 7) % 9 and (T.Mul(79, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(79, 7) + threadIdx_x // 7) % 9 and (T.Mul(79, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(79, 7) - -T.Mul(79, 7) + threadIdx_x // 7 - -(T.Mul(79, 7) + threadIdx_x // 7) // 9 - -(T.Mul(79, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(79, 7) - -T.Mul(79, 7) + threadIdx_x // 7 - -(T.Mul(79, 7) + threadIdx_x // 7) % 9 - -(T.Mul(79, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x // 
7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(79, 7) + threadIdx_x // 7) % 9 and (T.Mul(79, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(79, 49) - -T.Mul(79, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(79, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(79, 7) + threadIdx_x // 7) % 9 and (T.Mul(79, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(79, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(79, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(79, 7) + threadIdx_x // 7) % 9 and (T.Mul(79, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(79, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(79, 7) + threadIdx_x // 7 < 576: - if T.Mul(79, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(79, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(79, 7) + threadIdx_x // 7) % 9 and (T.Mul(79, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(79, 7) + threadIdx_x // 7 < 576: - if T.Mul(79, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(79, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(79, 7) + threadIdx_x // 7) % 9 and (T.Mul(79, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -80 - -T.Mul(80, 7) - -T.Mul(80, 7) + threadIdx_x // 7 - -T.Mul(80, 7) + threadIdx_x // 7 < 576 - -T.Mul(80, 49) - -T.Mul(80, 49) + threadIdx_x - -T.Mul(80, 49) + threadIdx_x < 4032 - -T.Mul(80, 7) - -T.Mul(80, 7) + threadIdx_x // 7 - -(T.Mul(80, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(80, 7) + threadIdx_x // 7) % 9 - -T.Mul(80, 7) - -T.Mul(80, 7) + threadIdx_x // 7 - -(T.Mul(80, 7) + threadIdx_x // 7) % 9 - -(T.Mul(80, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(80, 7) + threadIdx_x // 7) % 9 and (T.Mul(80, 7) + threadIdx_x // 7) 
% 9 < 8 - -1 <= (T.Mul(80, 7) + threadIdx_x // 7) % 9 and (T.Mul(80, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(80, 7) + threadIdx_x // 7) % 9 and (T.Mul(80, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(80, 7) - -T.Mul(80, 7) + threadIdx_x // 7 - -(T.Mul(80, 7) + threadIdx_x // 7) // 9 - -(T.Mul(80, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(80, 7) - -T.Mul(80, 7) + threadIdx_x // 7 - -(T.Mul(80, 7) + threadIdx_x // 7) % 9 - -(T.Mul(80, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(80, 7) + threadIdx_x // 7) % 9 and (T.Mul(80, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(80, 49) - -T.Mul(80, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(80, 49) + threadIdx_x] = 
T.if_then_else(1 <= (T.Mul(80, 7) + threadIdx_x // 7) % 9 and (T.Mul(80, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(80, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(80, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(80, 7) + threadIdx_x // 7) % 9 and (T.Mul(80, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(80, 7) + threadIdx_x // 7 < 576: - if T.Mul(80, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(80, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(80, 7) + threadIdx_x // 7) % 9 and (T.Mul(80, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(80, 7) + threadIdx_x // 7 < 576: - if T.Mul(80, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(80, 49) + threadIdx_x] = T.if_then_else(1 <= 
(T.Mul(80, 7) + threadIdx_x // 7) % 9 and (T.Mul(80, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -81 - -T.Mul(81, 7) - -T.Mul(81, 7) + threadIdx_x // 7 - -T.Mul(81, 7) + threadIdx_x // 7 < 576 - -T.Mul(81, 49) - -T.Mul(81, 49) + threadIdx_x - -T.Mul(81, 49) + threadIdx_x < 4032 - -T.Mul(81, 7) - -T.Mul(81, 7) + threadIdx_x // 7 - -(T.Mul(81, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(81, 7) + threadIdx_x // 7) % 9 - -T.Mul(81, 7) - -T.Mul(81, 7) + threadIdx_x // 7 - -(T.Mul(81, 7) + threadIdx_x // 7) % 9 - -(T.Mul(81, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(81, 7) + threadIdx_x // 7) % 9 and (T.Mul(81, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(81, 7) + threadIdx_x // 7) % 9 and (T.Mul(81, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(81, 7) + threadIdx_x // 7) % 9 and (T.Mul(81, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(81, 7) - -T.Mul(81, 7) + threadIdx_x // 7 - -(T.Mul(81, 7) + threadIdx_x // 7) // 9 - -(T.Mul(81, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(81, 7) - -T.Mul(81, 7) + threadIdx_x // 7 - -(T.Mul(81, 7) + threadIdx_x // 7) % 9 - -(T.Mul(81, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(81, 7) + 
threadIdx_x // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(81, 7) + threadIdx_x // 7) % 9 and (T.Mul(81, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(81, 49) - -T.Mul(81, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(81, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(81, 7) + threadIdx_x // 7) % 9 and (T.Mul(81, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(81, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(81, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(81, 7) + threadIdx_x // 7) % 9 and (T.Mul(81, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(81, 7) + threadIdx_x // 7 < 576: - if T.Mul(81, 49) + threadIdx_x < 4032: - 
pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(81, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(81, 7) + threadIdx_x // 7) % 9 and (T.Mul(81, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(81, 7) + threadIdx_x // 7 < 576: - if T.Mul(81, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(81, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(81, 7) + threadIdx_x // 7) % 9 and (T.Mul(81, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -82 - -T.Mul(82, 7) - -T.Mul(82, 7) + threadIdx_x // 7 - -T.Mul(82, 7) + threadIdx_x // 7 < 576 - -T.Mul(82, 49) - -T.Mul(82, 49) + threadIdx_x - -T.Mul(82, 49) + threadIdx_x < 4032 - -T.Mul(82, 7) - -T.Mul(82, 7) + threadIdx_x // 7 - -(T.Mul(82, 7) + threadIdx_x // 7) % 9 - -1 <= (T.Mul(82, 7) + threadIdx_x // 7) % 9 - -T.Mul(82, 7) - -T.Mul(82, 7) + threadIdx_x // 7 - -(T.Mul(82, 7) + threadIdx_x // 7) % 9 - -(T.Mul(82, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(82, 7) + threadIdx_x // 7) % 9 and (T.Mul(82, 7) + threadIdx_x // 7) % 9 < 8 - -1 <= (T.Mul(82, 7) + threadIdx_x // 7) % 9 and (T.Mul(82, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (T.Mul(82, 7) + threadIdx_x // 7) % 9 and (T.Mul(82, 7) + 
threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -T.Mul(82, 7) - -T.Mul(82, 7) + threadIdx_x // 7 - -(T.Mul(82, 7) + threadIdx_x // 7) // 9 - -(T.Mul(82, 7) + threadIdx_x // 7) // 9 * 49 - -rc_outer_outer * 3136 + (T.Mul(82, 7) + threadIdx_x // 7) // 9 * 49 - -T.Mul(82, 7) - -T.Mul(82, 7) + threadIdx_x // 7 - -(T.Mul(82, 7) + threadIdx_x // 7) % 9 - -(T.Mul(82, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(82, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x // 7) % 9 * 7 - -rc_outer_outer * 3136 + (T.Mul(82, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer - -rc_outer_outer * 3136 + (T.Mul(82, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -rc_outer_outer * 3136 + (T.Mul(82, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (T.Mul(82, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (T.Mul(82, 7) + threadIdx_x // 7) % 9 and (T.Mul(82, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(82, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -T.Mul(82, 49) - -T.Mul(82, 49) + threadIdx_x - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[T.Mul(82, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(82, 7) + threadIdx_x // 7) % 9 and (T.Mul(82, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer 
* 3136 + (T.Mul(82, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(82, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(82, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(82, 7) + threadIdx_x // 7) % 9 and (T.Mul(82, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(82, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -threadIdx_x = T.int32() -if T.Mul(82, 7) + threadIdx_x // 7 < 576: - if T.Mul(82, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(82, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(82, 7) + threadIdx_x // 7) % 9 and (T.Mul(82, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(82, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(82, 7) + threadIdx_x // 7 < 576: - if T.Mul(82, 49) + threadIdx_x < 4032: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[T.Mul(82, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(82, 7) + threadIdx_x // 7) % 9 and (T.Mul(82, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(82, 7) + 
threadIdx_x // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -49 - -0 - -49 - -T.Mul(0, 49) - -threadIdx_x - -T.Mul(0, 49) + threadIdx_x - -1536 - -T.Mul(0, 49) + threadIdx_x < 1536 - -blockIdx_x - -36864 - -blockIdx_x * 36864 - -49 - -T.Mul(0, 49) - -T.Mul(0, 49) + threadIdx_x - -192 - -(T.Mul(0, 49) + threadIdx_x) // 192 - -4608 - -(T.Mul(0, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(0, 49) + threadIdx_x) // 192 * 4608 - -576 - -rc_outer_outer * 576 - -blockIdx_x * 36864 + (T.Mul(0, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 - -49 - -T.Mul(0, 49) - -T.Mul(0, 49) + threadIdx_x - -192 - -(T.Mul(0, 49) + threadIdx_x) % 192 - -3 - -(T.Mul(0, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(0, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(0, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(0, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(0, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (T.Mul(0, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(0, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -49 - -T.Mul(0, 49) - -T.Mul(0, 49) + threadIdx_x - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[T.Mul(0, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(0, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(0, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -threadIdx_x = T.int32() -if T.Mul(0, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(0, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(0, 49) + threadIdx_x) // 192 * 
4608 + rc_outer_outer * 576 + (T.Mul(0, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(0, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(0, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(0, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(0, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -1 - -T.Mul(1, 49) - -T.Mul(1, 49) + threadIdx_x - -T.Mul(1, 49) + threadIdx_x < 1536 - -T.Mul(1, 49) - -T.Mul(1, 49) + threadIdx_x - -(T.Mul(1, 49) + threadIdx_x) // 192 - -(T.Mul(1, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(1, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(1, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 - -T.Mul(1, 49) - -T.Mul(1, 49) + threadIdx_x - -(T.Mul(1, 49) + threadIdx_x) % 192 - -(T.Mul(1, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(1, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(1, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(1, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(1, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (T.Mul(1, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(1, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -T.Mul(1, 49) - -T.Mul(1, 49) + threadIdx_x - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[T.Mul(1, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(1, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(1, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -threadIdx_x = T.int32() -if T.Mul(1, 49) + threadIdx_x < 1536: - 
kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(1, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(1, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(1, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(1, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(1, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(1, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(1, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -2 - -T.Mul(2, 49) - -T.Mul(2, 49) + threadIdx_x - -T.Mul(2, 49) + threadIdx_x < 1536 - -T.Mul(2, 49) - -T.Mul(2, 49) + threadIdx_x - -(T.Mul(2, 49) + threadIdx_x) // 192 - -(T.Mul(2, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(2, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(2, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 - -T.Mul(2, 49) - -T.Mul(2, 49) + threadIdx_x - -(T.Mul(2, 49) + threadIdx_x) % 192 - -(T.Mul(2, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(2, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(2, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(2, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(2, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (T.Mul(2, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(2, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -T.Mul(2, 49) - -T.Mul(2, 49) + threadIdx_x - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = 
T.int32() -kernel_shared[T.Mul(2, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(2, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(2, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -threadIdx_x = T.int32() -if T.Mul(2, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(2, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(2, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(2, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(2, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(2, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(2, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(2, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -3 - -T.Mul(3, 49) - -T.Mul(3, 49) + threadIdx_x - -T.Mul(3, 49) + threadIdx_x < 1536 - -T.Mul(3, 49) - -T.Mul(3, 49) + threadIdx_x - -(T.Mul(3, 49) + threadIdx_x) // 192 - -(T.Mul(3, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(3, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(3, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 - -T.Mul(3, 49) - -T.Mul(3, 49) + threadIdx_x - -(T.Mul(3, 49) + threadIdx_x) % 192 - -(T.Mul(3, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(3, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(3, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(3, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(3, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (T.Mul(3, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(3, 49) 
+ threadIdx_x) % 192 * 3 + rx_outer_outer] - -T.Mul(3, 49) - -T.Mul(3, 49) + threadIdx_x - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[T.Mul(3, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(3, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(3, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -threadIdx_x = T.int32() -if T.Mul(3, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(3, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(3, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(3, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(3, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(3, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(3, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(3, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -4 - -T.Mul(4, 49) - -T.Mul(4, 49) + threadIdx_x - -T.Mul(4, 49) + threadIdx_x < 1536 - -T.Mul(4, 49) - -T.Mul(4, 49) + threadIdx_x - -(T.Mul(4, 49) + threadIdx_x) // 192 - -(T.Mul(4, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(4, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(4, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 - -T.Mul(4, 49) - -T.Mul(4, 49) + threadIdx_x - -(T.Mul(4, 49) + threadIdx_x) % 192 - -(T.Mul(4, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(4, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(4, 49) + threadIdx_x) % 
192 * 3 - -blockIdx_x * 36864 + (T.Mul(4, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(4, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (T.Mul(4, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(4, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -T.Mul(4, 49) - -T.Mul(4, 49) + threadIdx_x - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[T.Mul(4, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(4, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(4, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -threadIdx_x = T.int32() -if T.Mul(4, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(4, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(4, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(4, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(4, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(4, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(4, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(4, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -5 - -T.Mul(5, 49) - -T.Mul(5, 49) + threadIdx_x - -T.Mul(5, 49) + threadIdx_x < 1536 - -T.Mul(5, 49) - -T.Mul(5, 49) + threadIdx_x - -(T.Mul(5, 49) + threadIdx_x) // 192 - -(T.Mul(5, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(5, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(5, 49) + threadIdx_x) // 192 * 4608 + 
rc_outer_outer * 576 - -T.Mul(5, 49) - -T.Mul(5, 49) + threadIdx_x - -(T.Mul(5, 49) + threadIdx_x) % 192 - -(T.Mul(5, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(5, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(5, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(5, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(5, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (T.Mul(5, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(5, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -T.Mul(5, 49) - -T.Mul(5, 49) + threadIdx_x - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[T.Mul(5, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(5, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(5, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -threadIdx_x = T.int32() -if T.Mul(5, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(5, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(5, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(5, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(5, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(5, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(5, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(5, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -6 - -T.Mul(6, 49) - -T.Mul(6, 49) + threadIdx_x - -T.Mul(6, 49) + threadIdx_x < 1536 
- -T.Mul(6, 49) - -T.Mul(6, 49) + threadIdx_x - -(T.Mul(6, 49) + threadIdx_x) // 192 - -(T.Mul(6, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(6, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(6, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 - -T.Mul(6, 49) - -T.Mul(6, 49) + threadIdx_x - -(T.Mul(6, 49) + threadIdx_x) % 192 - -(T.Mul(6, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(6, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(6, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(6, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(6, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (T.Mul(6, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(6, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -T.Mul(6, 49) - -T.Mul(6, 49) + threadIdx_x - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[T.Mul(6, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(6, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(6, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -threadIdx_x = T.int32() -if T.Mul(6, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(6, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(6, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(6, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(6, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - 
kernel_shared[T.Mul(6, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(6, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(6, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -7 - -T.Mul(7, 49) - -T.Mul(7, 49) + threadIdx_x - -T.Mul(7, 49) + threadIdx_x < 1536 - -T.Mul(7, 49) - -T.Mul(7, 49) + threadIdx_x - -(T.Mul(7, 49) + threadIdx_x) // 192 - -(T.Mul(7, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(7, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(7, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 - -T.Mul(7, 49) - -T.Mul(7, 49) + threadIdx_x - -(T.Mul(7, 49) + threadIdx_x) % 192 - -(T.Mul(7, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(7, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(7, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(7, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(7, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (T.Mul(7, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(7, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -T.Mul(7, 49) - -T.Mul(7, 49) + threadIdx_x - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[T.Mul(7, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(7, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(7, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -threadIdx_x = T.int32() -if T.Mul(7, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(7, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(7, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(7, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] 
- -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(7, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(7, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(7, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(7, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -8 - -T.Mul(8, 49) - -T.Mul(8, 49) + threadIdx_x - -T.Mul(8, 49) + threadIdx_x < 1536 - -T.Mul(8, 49) - -T.Mul(8, 49) + threadIdx_x - -(T.Mul(8, 49) + threadIdx_x) // 192 - -(T.Mul(8, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(8, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(8, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 - -T.Mul(8, 49) - -T.Mul(8, 49) + threadIdx_x - -(T.Mul(8, 49) + threadIdx_x) % 192 - -(T.Mul(8, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(8, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(8, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(8, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(8, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (T.Mul(8, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(8, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -T.Mul(8, 49) - -T.Mul(8, 49) + threadIdx_x - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[T.Mul(8, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(8, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(8, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -threadIdx_x = T.int32() -if T.Mul(8, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = 
T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(8, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(8, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(8, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(8, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(8, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(8, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(8, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -9 - -T.Mul(9, 49) - -T.Mul(9, 49) + threadIdx_x - -T.Mul(9, 49) + threadIdx_x < 1536 - -T.Mul(9, 49) - -T.Mul(9, 49) + threadIdx_x - -(T.Mul(9, 49) + threadIdx_x) // 192 - -(T.Mul(9, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(9, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(9, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 - -T.Mul(9, 49) - -T.Mul(9, 49) + threadIdx_x - -(T.Mul(9, 49) + threadIdx_x) % 192 - -(T.Mul(9, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(9, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(9, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(9, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(9, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (T.Mul(9, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(9, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -T.Mul(9, 49) - -T.Mul(9, 49) + threadIdx_x - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[T.Mul(9, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(9, 49) + 
threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(9, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -threadIdx_x = T.int32() -if T.Mul(9, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(9, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(9, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(9, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(9, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(9, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(9, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(9, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -10 - -T.Mul(10, 49) - -T.Mul(10, 49) + threadIdx_x - -T.Mul(10, 49) + threadIdx_x < 1536 - -T.Mul(10, 49) - -T.Mul(10, 49) + threadIdx_x - -(T.Mul(10, 49) + threadIdx_x) // 192 - -(T.Mul(10, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(10, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(10, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 - -T.Mul(10, 49) - -T.Mul(10, 49) + threadIdx_x - -(T.Mul(10, 49) + threadIdx_x) % 192 - -(T.Mul(10, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(10, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(10, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(10, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(10, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (T.Mul(10, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(10, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -T.Mul(10, 49) - -T.Mul(10, 49) + 
threadIdx_x - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[T.Mul(10, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(10, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(10, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -threadIdx_x = T.int32() -if T.Mul(10, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(10, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(10, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(10, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(10, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(10, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(10, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(10, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -11 - -T.Mul(11, 49) - -T.Mul(11, 49) + threadIdx_x - -T.Mul(11, 49) + threadIdx_x < 1536 - -T.Mul(11, 49) - -T.Mul(11, 49) + threadIdx_x - -(T.Mul(11, 49) + threadIdx_x) // 192 - -(T.Mul(11, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(11, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(11, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 - -T.Mul(11, 49) - -T.Mul(11, 49) + threadIdx_x - -(T.Mul(11, 49) + threadIdx_x) % 192 - -(T.Mul(11, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(11, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(11, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(11, 49) + 
threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(11, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (T.Mul(11, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(11, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -T.Mul(11, 49) - -T.Mul(11, 49) + threadIdx_x - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[T.Mul(11, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(11, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(11, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -threadIdx_x = T.int32() -if T.Mul(11, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(11, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(11, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(11, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(11, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(11, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(11, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(11, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -12 - -T.Mul(12, 49) - -T.Mul(12, 49) + threadIdx_x - -T.Mul(12, 49) + threadIdx_x < 1536 - -T.Mul(12, 49) - -T.Mul(12, 49) + threadIdx_x - -(T.Mul(12, 49) + threadIdx_x) // 192 - -(T.Mul(12, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(12, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(12, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 - 
-T.Mul(12, 49) - -T.Mul(12, 49) + threadIdx_x - -(T.Mul(12, 49) + threadIdx_x) % 192 - -(T.Mul(12, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(12, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(12, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(12, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(12, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (T.Mul(12, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(12, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -T.Mul(12, 49) - -T.Mul(12, 49) + threadIdx_x - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[T.Mul(12, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(12, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(12, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -threadIdx_x = T.int32() -if T.Mul(12, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(12, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(12, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(12, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(12, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(12, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(12, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(12, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -13 - -T.Mul(13, 49) - -T.Mul(13, 49) + threadIdx_x - -T.Mul(13, 49) + threadIdx_x < 
1536 - -T.Mul(13, 49) - -T.Mul(13, 49) + threadIdx_x - -(T.Mul(13, 49) + threadIdx_x) // 192 - -(T.Mul(13, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(13, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(13, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 - -T.Mul(13, 49) - -T.Mul(13, 49) + threadIdx_x - -(T.Mul(13, 49) + threadIdx_x) % 192 - -(T.Mul(13, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(13, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(13, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(13, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(13, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (T.Mul(13, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(13, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -T.Mul(13, 49) - -T.Mul(13, 49) + threadIdx_x - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[T.Mul(13, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(13, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(13, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -threadIdx_x = T.int32() -if T.Mul(13, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(13, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(13, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(13, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(13, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer 
= T.int32() - kernel_shared[T.Mul(13, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(13, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(13, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -14 - -T.Mul(14, 49) - -T.Mul(14, 49) + threadIdx_x - -T.Mul(14, 49) + threadIdx_x < 1536 - -T.Mul(14, 49) - -T.Mul(14, 49) + threadIdx_x - -(T.Mul(14, 49) + threadIdx_x) // 192 - -(T.Mul(14, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(14, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(14, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 - -T.Mul(14, 49) - -T.Mul(14, 49) + threadIdx_x - -(T.Mul(14, 49) + threadIdx_x) % 192 - -(T.Mul(14, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(14, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(14, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(14, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(14, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (T.Mul(14, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(14, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -T.Mul(14, 49) - -T.Mul(14, 49) + threadIdx_x - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[T.Mul(14, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(14, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(14, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -threadIdx_x = T.int32() -if T.Mul(14, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(14, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(14, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(14, 
49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(14, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(14, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(14, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(14, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -15 - -T.Mul(15, 49) - -T.Mul(15, 49) + threadIdx_x - -T.Mul(15, 49) + threadIdx_x < 1536 - -T.Mul(15, 49) - -T.Mul(15, 49) + threadIdx_x - -(T.Mul(15, 49) + threadIdx_x) // 192 - -(T.Mul(15, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(15, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(15, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 - -T.Mul(15, 49) - -T.Mul(15, 49) + threadIdx_x - -(T.Mul(15, 49) + threadIdx_x) % 192 - -(T.Mul(15, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(15, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(15, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(15, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(15, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (T.Mul(15, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(15, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -T.Mul(15, 49) - -T.Mul(15, 49) + threadIdx_x - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[T.Mul(15, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(15, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(15, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -threadIdx_x = T.int32() -if T.Mul(15, 49) + threadIdx_x < 1536: - kernel_shared = 
T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(15, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(15, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(15, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(15, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(15, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(15, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(15, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -16 - -T.Mul(16, 49) - -T.Mul(16, 49) + threadIdx_x - -T.Mul(16, 49) + threadIdx_x < 1536 - -T.Mul(16, 49) - -T.Mul(16, 49) + threadIdx_x - -(T.Mul(16, 49) + threadIdx_x) // 192 - -(T.Mul(16, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(16, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(16, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 - -T.Mul(16, 49) - -T.Mul(16, 49) + threadIdx_x - -(T.Mul(16, 49) + threadIdx_x) % 192 - -(T.Mul(16, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(16, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(16, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(16, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(16, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (T.Mul(16, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(16, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -T.Mul(16, 49) - -T.Mul(16, 49) + threadIdx_x - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() 
-rx_outer_outer = T.int32() -kernel_shared[T.Mul(16, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(16, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(16, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -threadIdx_x = T.int32() -if T.Mul(16, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(16, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(16, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(16, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(16, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(16, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(16, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(16, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -17 - -T.Mul(17, 49) - -T.Mul(17, 49) + threadIdx_x - -T.Mul(17, 49) + threadIdx_x < 1536 - -T.Mul(17, 49) - -T.Mul(17, 49) + threadIdx_x - -(T.Mul(17, 49) + threadIdx_x) // 192 - -(T.Mul(17, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(17, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(17, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 - -T.Mul(17, 49) - -T.Mul(17, 49) + threadIdx_x - -(T.Mul(17, 49) + threadIdx_x) % 192 - -(T.Mul(17, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(17, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(17, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(17, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(17, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (T.Mul(17, 49) + threadIdx_x) // 
192 * 4608 + rc_outer_outer * 576 + (T.Mul(17, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -T.Mul(17, 49) - -T.Mul(17, 49) + threadIdx_x - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[T.Mul(17, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(17, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(17, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -threadIdx_x = T.int32() -if T.Mul(17, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(17, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(17, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(17, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(17, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(17, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(17, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(17, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -18 - -T.Mul(18, 49) - -T.Mul(18, 49) + threadIdx_x - -T.Mul(18, 49) + threadIdx_x < 1536 - -T.Mul(18, 49) - -T.Mul(18, 49) + threadIdx_x - -(T.Mul(18, 49) + threadIdx_x) // 192 - -(T.Mul(18, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(18, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(18, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 - -T.Mul(18, 49) - -T.Mul(18, 49) + threadIdx_x - -(T.Mul(18, 49) + threadIdx_x) % 192 - -(T.Mul(18, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(18, 49) + 
threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(18, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(18, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(18, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (T.Mul(18, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(18, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -T.Mul(18, 49) - -T.Mul(18, 49) + threadIdx_x - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[T.Mul(18, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(18, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(18, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -threadIdx_x = T.int32() -if T.Mul(18, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(18, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(18, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(18, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(18, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(18, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(18, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(18, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -19 - -T.Mul(19, 49) - -T.Mul(19, 49) + threadIdx_x - -T.Mul(19, 49) + threadIdx_x < 1536 - -T.Mul(19, 49) - -T.Mul(19, 49) + threadIdx_x - -(T.Mul(19, 49) + threadIdx_x) // 192 - -(T.Mul(19, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + 
(T.Mul(19, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(19, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 - -T.Mul(19, 49) - -T.Mul(19, 49) + threadIdx_x - -(T.Mul(19, 49) + threadIdx_x) % 192 - -(T.Mul(19, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(19, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(19, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(19, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(19, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (T.Mul(19, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(19, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -T.Mul(19, 49) - -T.Mul(19, 49) + threadIdx_x - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[T.Mul(19, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(19, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(19, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -threadIdx_x = T.int32() -if T.Mul(19, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(19, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(19, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(19, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(19, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(19, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(19, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + 
(T.Mul(19, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -20 - -T.Mul(20, 49) - -T.Mul(20, 49) + threadIdx_x - -T.Mul(20, 49) + threadIdx_x < 1536 - -T.Mul(20, 49) - -T.Mul(20, 49) + threadIdx_x - -(T.Mul(20, 49) + threadIdx_x) // 192 - -(T.Mul(20, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(20, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(20, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 - -T.Mul(20, 49) - -T.Mul(20, 49) + threadIdx_x - -(T.Mul(20, 49) + threadIdx_x) % 192 - -(T.Mul(20, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(20, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(20, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(20, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(20, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (T.Mul(20, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(20, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -T.Mul(20, 49) - -T.Mul(20, 49) + threadIdx_x - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[T.Mul(20, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(20, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(20, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -threadIdx_x = T.int32() -if T.Mul(20, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(20, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(20, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(20, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(20, 49) + threadIdx_x < 1536: - 
kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(20, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(20, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(20, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -21 - -T.Mul(21, 49) - -T.Mul(21, 49) + threadIdx_x - -T.Mul(21, 49) + threadIdx_x < 1536 - -T.Mul(21, 49) - -T.Mul(21, 49) + threadIdx_x - -(T.Mul(21, 49) + threadIdx_x) // 192 - -(T.Mul(21, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(21, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(21, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 - -T.Mul(21, 49) - -T.Mul(21, 49) + threadIdx_x - -(T.Mul(21, 49) + threadIdx_x) % 192 - -(T.Mul(21, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(21, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(21, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(21, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(21, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (T.Mul(21, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(21, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -T.Mul(21, 49) - -T.Mul(21, 49) + threadIdx_x - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[T.Mul(21, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(21, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(21, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -threadIdx_x = T.int32() -if T.Mul(21, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - 
kernel_shared[T.Mul(21, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(21, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(21, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(21, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(21, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(21, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(21, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -22 - -T.Mul(22, 49) - -T.Mul(22, 49) + threadIdx_x - -T.Mul(22, 49) + threadIdx_x < 1536 - -T.Mul(22, 49) - -T.Mul(22, 49) + threadIdx_x - -(T.Mul(22, 49) + threadIdx_x) // 192 - -(T.Mul(22, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(22, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(22, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 - -T.Mul(22, 49) - -T.Mul(22, 49) + threadIdx_x - -(T.Mul(22, 49) + threadIdx_x) % 192 - -(T.Mul(22, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(22, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(22, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(22, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(22, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (T.Mul(22, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(22, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -T.Mul(22, 49) - -T.Mul(22, 49) + threadIdx_x - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[T.Mul(22, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(22, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 
576 + (T.Mul(22, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -threadIdx_x = T.int32() -if T.Mul(22, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(22, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(22, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(22, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(22, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(22, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(22, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(22, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -23 - -T.Mul(23, 49) - -T.Mul(23, 49) + threadIdx_x - -T.Mul(23, 49) + threadIdx_x < 1536 - -T.Mul(23, 49) - -T.Mul(23, 49) + threadIdx_x - -(T.Mul(23, 49) + threadIdx_x) // 192 - -(T.Mul(23, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(23, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(23, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 - -T.Mul(23, 49) - -T.Mul(23, 49) + threadIdx_x - -(T.Mul(23, 49) + threadIdx_x) % 192 - -(T.Mul(23, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(23, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(23, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(23, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(23, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (T.Mul(23, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(23, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -T.Mul(23, 49) - -T.Mul(23, 49) + threadIdx_x - -kernel_shared = 
T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[T.Mul(23, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(23, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(23, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -threadIdx_x = T.int32() -if T.Mul(23, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(23, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(23, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(23, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(23, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(23, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(23, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(23, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -24 - -T.Mul(24, 49) - -T.Mul(24, 49) + threadIdx_x - -T.Mul(24, 49) + threadIdx_x < 1536 - -T.Mul(24, 49) - -T.Mul(24, 49) + threadIdx_x - -(T.Mul(24, 49) + threadIdx_x) // 192 - -(T.Mul(24, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(24, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(24, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 - -T.Mul(24, 49) - -T.Mul(24, 49) + threadIdx_x - -(T.Mul(24, 49) + threadIdx_x) % 192 - -(T.Mul(24, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(24, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(24, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(24, 49) + threadIdx_x) // 192 * 4608 + 
rc_outer_outer * 576 + (T.Mul(24, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (T.Mul(24, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(24, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -T.Mul(24, 49) - -T.Mul(24, 49) + threadIdx_x - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[T.Mul(24, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(24, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(24, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -threadIdx_x = T.int32() -if T.Mul(24, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(24, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(24, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(24, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(24, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(24, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(24, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(24, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -25 - -T.Mul(25, 49) - -T.Mul(25, 49) + threadIdx_x - -T.Mul(25, 49) + threadIdx_x < 1536 - -T.Mul(25, 49) - -T.Mul(25, 49) + threadIdx_x - -(T.Mul(25, 49) + threadIdx_x) // 192 - -(T.Mul(25, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(25, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(25, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 - -T.Mul(25, 49) - -T.Mul(25, 
49) + threadIdx_x - -(T.Mul(25, 49) + threadIdx_x) % 192 - -(T.Mul(25, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(25, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(25, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(25, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(25, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (T.Mul(25, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(25, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -T.Mul(25, 49) - -T.Mul(25, 49) + threadIdx_x - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[T.Mul(25, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(25, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(25, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -threadIdx_x = T.int32() -if T.Mul(25, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(25, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(25, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(25, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(25, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(25, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(25, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(25, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -26 - -T.Mul(26, 49) - -T.Mul(26, 49) + threadIdx_x - -T.Mul(26, 49) + threadIdx_x < 1536 - -T.Mul(26, 49) - 
-T.Mul(26, 49) + threadIdx_x - -(T.Mul(26, 49) + threadIdx_x) // 192 - -(T.Mul(26, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(26, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(26, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 - -T.Mul(26, 49) - -T.Mul(26, 49) + threadIdx_x - -(T.Mul(26, 49) + threadIdx_x) % 192 - -(T.Mul(26, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(26, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(26, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(26, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(26, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (T.Mul(26, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(26, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -T.Mul(26, 49) - -T.Mul(26, 49) + threadIdx_x - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[T.Mul(26, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(26, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(26, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -threadIdx_x = T.int32() -if T.Mul(26, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(26, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(26, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(26, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(26, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - 
kernel_shared[T.Mul(26, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(26, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(26, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -27 - -T.Mul(27, 49) - -T.Mul(27, 49) + threadIdx_x - -T.Mul(27, 49) + threadIdx_x < 1536 - -T.Mul(27, 49) - -T.Mul(27, 49) + threadIdx_x - -(T.Mul(27, 49) + threadIdx_x) // 192 - -(T.Mul(27, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(27, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(27, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 - -T.Mul(27, 49) - -T.Mul(27, 49) + threadIdx_x - -(T.Mul(27, 49) + threadIdx_x) % 192 - -(T.Mul(27, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(27, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(27, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(27, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(27, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (T.Mul(27, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(27, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -T.Mul(27, 49) - -T.Mul(27, 49) + threadIdx_x - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[T.Mul(27, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(27, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(27, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -threadIdx_x = T.int32() -if T.Mul(27, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(27, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(27, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(27, 49) + 
threadIdx_x) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(27, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(27, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(27, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(27, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -28 - -T.Mul(28, 49) - -T.Mul(28, 49) + threadIdx_x - -T.Mul(28, 49) + threadIdx_x < 1536 - -T.Mul(28, 49) - -T.Mul(28, 49) + threadIdx_x - -(T.Mul(28, 49) + threadIdx_x) // 192 - -(T.Mul(28, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(28, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(28, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 - -T.Mul(28, 49) - -T.Mul(28, 49) + threadIdx_x - -(T.Mul(28, 49) + threadIdx_x) % 192 - -(T.Mul(28, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(28, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(28, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(28, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(28, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (T.Mul(28, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(28, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -T.Mul(28, 49) - -T.Mul(28, 49) + threadIdx_x - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[T.Mul(28, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(28, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(28, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -threadIdx_x = T.int32() -if T.Mul(28, 49) + threadIdx_x < 1536: - kernel_shared = 
T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(28, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(28, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(28, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(28, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(28, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(28, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(28, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -29 - -T.Mul(29, 49) - -T.Mul(29, 49) + threadIdx_x - -T.Mul(29, 49) + threadIdx_x < 1536 - -T.Mul(29, 49) - -T.Mul(29, 49) + threadIdx_x - -(T.Mul(29, 49) + threadIdx_x) // 192 - -(T.Mul(29, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(29, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(29, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 - -T.Mul(29, 49) - -T.Mul(29, 49) + threadIdx_x - -(T.Mul(29, 49) + threadIdx_x) % 192 - -(T.Mul(29, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(29, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(29, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(29, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(29, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (T.Mul(29, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(29, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -T.Mul(29, 49) - -T.Mul(29, 49) + threadIdx_x - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() 
-rx_outer_outer = T.int32() -kernel_shared[T.Mul(29, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(29, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(29, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -threadIdx_x = T.int32() -if T.Mul(29, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(29, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(29, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(29, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(29, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(29, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(29, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(29, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -30 - -T.Mul(30, 49) - -T.Mul(30, 49) + threadIdx_x - -T.Mul(30, 49) + threadIdx_x < 1536 - -T.Mul(30, 49) - -T.Mul(30, 49) + threadIdx_x - -(T.Mul(30, 49) + threadIdx_x) // 192 - -(T.Mul(30, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(30, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(30, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 - -T.Mul(30, 49) - -T.Mul(30, 49) + threadIdx_x - -(T.Mul(30, 49) + threadIdx_x) % 192 - -(T.Mul(30, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(30, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(30, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(30, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(30, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (T.Mul(30, 49) + threadIdx_x) // 
192 * 4608 + rc_outer_outer * 576 + (T.Mul(30, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -T.Mul(30, 49) - -T.Mul(30, 49) + threadIdx_x - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[T.Mul(30, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(30, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(30, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -threadIdx_x = T.int32() -if T.Mul(30, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(30, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(30, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(30, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(30, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(30, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(30, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(30, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -31 - -T.Mul(31, 49) - -T.Mul(31, 49) + threadIdx_x - -T.Mul(31, 49) + threadIdx_x < 1536 - -T.Mul(31, 49) - -T.Mul(31, 49) + threadIdx_x - -(T.Mul(31, 49) + threadIdx_x) // 192 - -(T.Mul(31, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(31, 49) + threadIdx_x) // 192 * 4608 - -blockIdx_x * 36864 + (T.Mul(31, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 - -T.Mul(31, 49) - -T.Mul(31, 49) + threadIdx_x - -(T.Mul(31, 49) + threadIdx_x) % 192 - -(T.Mul(31, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(31, 49) + 
threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(31, 49) + threadIdx_x) % 192 * 3 - -blockIdx_x * 36864 + (T.Mul(31, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(31, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (T.Mul(31, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(31, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -T.Mul(31, 49) - -T.Mul(31, 49) + threadIdx_x - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[T.Mul(31, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(31, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(31, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -threadIdx_x = T.int32() -if T.Mul(31, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(31, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(31, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(31, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if T.Mul(31, 49) + threadIdx_x < 1536: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[T.Mul(31, 49) + threadIdx_x] = kernel[blockIdx_x * 36864 + (T.Mul(31, 49) + threadIdx_x) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(31, 49) + threadIdx_x) % 192 * 3 + rx_outer_outer] - -0 - -8 - -0 - -4 - -T.Mul(0, 4) - -0 - -T.Add(T.Mul(0, 4), 0) - -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] - -rc_outer_inner - -504 - -rc_outer_inner * 504 - -0 - -63 - -T.Mul(0, 63) - -rc_outer_inner * 504 + T.Mul(0, 63) - -0 - -7 - -T.Mul(0, 7) - 
-rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) - -threadIdx_x - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] - -768 - -T.Mul(0, 768) - -192 - -T.Mul(0, 192) - -T.Mul(0, 768) + T.Mul(0, 192) - -24 - -rc_outer_inner * 24 - -T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 - -3 - -T.Mul(0, 3) - -T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) - -T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0) - -kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - -4 - -T.Mul(0, 4) - -T.Add(T.Mul(0, 4), 0) - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - -T.Mul(0, 4) - -1 - -T.Mul(0, 4) + 1 - -conv2d_nchw[T.Mul(0, 4) + 1] - -T.Mul(0, 63) - -rc_outer_inner * 504 + T.Mul(0, 63) - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(0, 768) - -T.Mul(1, 192) - -T.Mul(0, 768) + T.Mul(1, 192) - -T.Mul(0, 768) + T.Mul(1, 192) + 
rc_outer_inner * 24 - -T.Mul(0, 3) - -T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) - -T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0) - -kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - -conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - -T.Mul(0, 4) - -2 - -T.Mul(0, 4) + 2 - -conv2d_nchw[T.Mul(0, 4) + 2] - -T.Mul(0, 63) - -rc_outer_inner * 504 + T.Mul(0, 63) - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(0, 768) - -T.Mul(2, 192) - -T.Mul(0, 768) + T.Mul(2, 192) - -T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 - -T.Mul(0, 3) - -T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) - -T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0) - -kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + 
T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - -conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - -T.Mul(0, 4) - -3 - -T.Mul(0, 4) + 3 - -conv2d_nchw[T.Mul(0, 4) + 3] - -T.Mul(0, 63) - -rc_outer_inner * 504 + T.Mul(0, 63) - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(0, 768) - -T.Mul(3, 192) - -T.Mul(0, 768) + T.Mul(3, 192) - -T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 - -T.Mul(0, 3) - -T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) - -T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0) - -kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - -conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), 
scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - -T.Mul(0, 4) - -T.Add(T.Mul(0, 4), 0) - -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] - -1 - -T.Mul(1, 63) - -rc_outer_inner * 504 + T.Mul(1, 63) - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(0, 768) - -T.Mul(0, 768) + T.Mul(0, 192) - -T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 - -T.Mul(1, 3) - -T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) - -T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0) - -kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - -T.Mul(0, 4) - -T.Add(T.Mul(0, 4), 0) - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) 
+ 1 - -conv2d_nchw[T.Mul(0, 4) + 1] - -T.Mul(1, 63) - -rc_outer_inner * 504 + T.Mul(1, 63) - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(0, 768) - -T.Mul(0, 768) + T.Mul(1, 192) - -T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 - -T.Mul(1, 3) - -T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) - -T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0) - -kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - -conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 2 - -conv2d_nchw[T.Mul(0, 4) + 2] - -T.Mul(1, 63) - -rc_outer_inner * 504 + T.Mul(1, 63) - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(0, 768) - -T.Mul(0, 768) + T.Mul(2, 192) - -T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 - -T.Mul(1, 
3) - -T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) - -T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0) - -kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - -conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 3 - -conv2d_nchw[T.Mul(0, 4) + 3] - -T.Mul(1, 63) - -rc_outer_inner * 504 + T.Mul(1, 63) - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(0, 768) - -T.Mul(0, 768) + T.Mul(3, 192) - -T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 - -T.Mul(1, 3) - -T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) - -T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0) - -kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - 
-conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - -T.Mul(0, 4) - -T.Add(T.Mul(0, 4), 0) - -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] - -2 - -T.Mul(2, 63) - -rc_outer_inner * 504 + T.Mul(2, 63) - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(0, 768) - -T.Mul(0, 768) + T.Mul(0, 192) - -T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 - -T.Mul(2, 3) - -T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) - -T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0) - -kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - -T.Mul(0, 4) - -T.Add(T.Mul(0, 4), 0) - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = 
T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 1 - -conv2d_nchw[T.Mul(0, 4) + 1] - -T.Mul(2, 63) - -rc_outer_inner * 504 + T.Mul(2, 63) - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(0, 768) - -T.Mul(0, 768) + T.Mul(1, 192) - -T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 - -T.Mul(2, 3) - -T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) - -T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0) - -kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - -conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 2 - -conv2d_nchw[T.Mul(0, 4) + 2] - -T.Mul(2, 63) - -rc_outer_inner * 504 + T.Mul(2, 
63) - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(0, 768) - -T.Mul(0, 768) + T.Mul(2, 192) - -T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 - -T.Mul(2, 3) - -T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) - -T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0) - -kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - -conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 3 - -conv2d_nchw[T.Mul(0, 4) + 3] - -T.Mul(2, 63) - -rc_outer_inner * 504 + T.Mul(2, 63) - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(0, 768) - -T.Mul(0, 768) + T.Mul(3, 192) - -T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 - -T.Mul(2, 3) - -T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) - 
-T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0) - -kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - -conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - -T.Mul(0, 4) - -T.Add(T.Mul(0, 4), 0) - -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] - -3 - -T.Mul(3, 63) - -rc_outer_inner * 504 + T.Mul(3, 63) - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(0, 768) - -T.Mul(0, 768) + T.Mul(0, 192) - -T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 - -T.Mul(3, 3) - -T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) - -T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0) - -kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - -T.Mul(0, 4) - -T.Add(T.Mul(0, 4), 0) - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 1 - -conv2d_nchw[T.Mul(0, 4) + 1] - -T.Mul(3, 63) - -rc_outer_inner * 504 + T.Mul(3, 63) - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(0, 768) - -T.Mul(0, 768) + T.Mul(1, 192) - -T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 - -T.Mul(3, 3) - -T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) - -T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0) - -kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - -conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), 
scope="shared") -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 2 - -conv2d_nchw[T.Mul(0, 4) + 2] - -T.Mul(3, 63) - -rc_outer_inner * 504 + T.Mul(3, 63) - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(0, 768) - -T.Mul(0, 768) + T.Mul(2, 192) - -T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 - -T.Mul(3, 3) - -T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) - -T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0) - -kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - -conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 3 - -conv2d_nchw[T.Mul(0, 4) + 3] - -T.Mul(3, 63) - -rc_outer_inner * 504 + T.Mul(3, 63) - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) 
+ T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(0, 768) - -T.Mul(0, 768) + T.Mul(3, 192) - -T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 - -T.Mul(3, 3) - -T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) - -T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0) - -kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - -conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - -T.Mul(0, 4) - -T.Add(T.Mul(0, 4), 0) - -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] - -4 - -T.Mul(4, 63) - -rc_outer_inner * 504 + T.Mul(4, 63) - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(0, 768) - -T.Mul(0, 768) + T.Mul(0, 192) - -T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 - -T.Mul(4, 3) - -T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) - -T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner 
* 24 + T.Mul(4, 3), 0) - -kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - -T.Mul(0, 4) - -T.Add(T.Mul(0, 4), 0) - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 1 - -conv2d_nchw[T.Mul(0, 4) + 1] - -T.Mul(4, 63) - -rc_outer_inner * 504 + T.Mul(4, 63) - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(0, 768) - -T.Mul(0, 768) + T.Mul(1, 192) - -T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 - -T.Mul(4, 3) - -T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) - -T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0) - -kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - -conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + 
threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 2 - -conv2d_nchw[T.Mul(0, 4) + 2] - -T.Mul(4, 63) - -rc_outer_inner * 504 + T.Mul(4, 63) - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(0, 768) - -T.Mul(0, 768) + T.Mul(2, 192) - -T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 - -T.Mul(4, 3) - -T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) - -T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0) - -kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - -conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 3 - -conv2d_nchw[T.Mul(0, 4) + 3] - -T.Mul(4, 63) - -rc_outer_inner * 504 + T.Mul(4, 63) - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(0, 768) - -T.Mul(0, 768) + T.Mul(3, 192) - -T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 - -T.Mul(4, 3) - -T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) - -T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0) - -kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - -conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - -T.Mul(0, 4) - -T.Add(T.Mul(0, 4), 0) - -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] - -5 - -T.Mul(5, 63) - -rc_outer_inner * 504 + T.Mul(5, 63) - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + 
T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(0, 768) - -T.Mul(0, 768) + T.Mul(0, 192) - -T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 - -T.Mul(5, 3) - -T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) - -T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0) - -kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - -T.Mul(0, 4) - -T.Add(T.Mul(0, 4), 0) - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 1 - -conv2d_nchw[T.Mul(0, 4) + 1] - -T.Mul(5, 63) - -rc_outer_inner * 504 + T.Mul(5, 63) - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(0, 768) - -T.Mul(0, 768) + T.Mul(1, 192) - -T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 - -T.Mul(5, 3) - -T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) - -T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0) - 
-kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - -conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 2 - -conv2d_nchw[T.Mul(0, 4) + 2] - -T.Mul(5, 63) - -rc_outer_inner * 504 + T.Mul(5, 63) - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(0, 768) - -T.Mul(0, 768) + T.Mul(2, 192) - -T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 - -T.Mul(5, 3) - -T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) - -T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0) - -kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - -conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + 
T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 3 - -conv2d_nchw[T.Mul(0, 4) + 3] - -T.Mul(5, 63) - -rc_outer_inner * 504 + T.Mul(5, 63) - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(0, 768) - -T.Mul(0, 768) + T.Mul(3, 192) - -T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 - -T.Mul(5, 3) - -T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) - -T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0) - -kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - -conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + 
T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - -T.Mul(0, 4) - -T.Add(T.Mul(0, 4), 0) - -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] - -6 - -T.Mul(6, 63) - -rc_outer_inner * 504 + T.Mul(6, 63) - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(0, 768) - -T.Mul(0, 768) + T.Mul(0, 192) - -T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 - -T.Mul(6, 3) - -T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) - -T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0) - -kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - -T.Mul(0, 4) - -T.Add(T.Mul(0, 4), 0) - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 1 - -conv2d_nchw[T.Mul(0, 4) + 1] - -T.Mul(6, 63) - -rc_outer_inner * 504 + T.Mul(6, 63) - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x - 
-pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(0, 768) - -T.Mul(0, 768) + T.Mul(1, 192) - -T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 - -T.Mul(6, 3) - -T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) - -T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0) - -kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - -conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 2 - -conv2d_nchw[T.Mul(0, 4) + 2] - -T.Mul(6, 63) - -rc_outer_inner * 504 + T.Mul(6, 63) - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(0, 768) - -T.Mul(0, 768) + T.Mul(2, 192) - -T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 - -T.Mul(6, 3) - -T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) - -T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0) - -kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + 
T.Mul(6, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - -conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 3 - -conv2d_nchw[T.Mul(0, 4) + 3] - -T.Mul(6, 63) - -rc_outer_inner * 504 + T.Mul(6, 63) - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(0, 768) - -T.Mul(0, 768) + T.Mul(3, 192) - -T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 - -T.Mul(6, 3) - -T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) - -T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0) - -kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - -conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - -T.Mul(0, 4) - 
-T.Mul(0, 4) + 3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - -T.Mul(0, 4) - -T.Add(T.Mul(0, 4), 0) - -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] - -7 - -T.Mul(7, 63) - -rc_outer_inner * 504 + T.Mul(7, 63) - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(0, 768) - -T.Mul(0, 768) + T.Mul(0, 192) - -T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 - -T.Mul(7, 3) - -T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) - -T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0) - -kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - -T.Mul(0, 4) - -T.Add(T.Mul(0, 4), 0) - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * 
kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 1 - -conv2d_nchw[T.Mul(0, 4) + 1] - -T.Mul(7, 63) - -rc_outer_inner * 504 + T.Mul(7, 63) - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(0, 768) - -T.Mul(0, 768) + T.Mul(1, 192) - -T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 - -T.Mul(7, 3) - -T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) - -T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0) - -kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - -conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 2 - -conv2d_nchw[T.Mul(0, 4) + 2] - -T.Mul(7, 63) - -rc_outer_inner * 504 + T.Mul(7, 63) - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + 
threadIdx_x] - -T.Mul(0, 768) - -T.Mul(0, 768) + T.Mul(2, 192) - -T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 - -T.Mul(7, 3) - -T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) - -T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0) - -kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - -conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 3 - -conv2d_nchw[T.Mul(0, 4) + 3] - -T.Mul(7, 63) - -rc_outer_inner * 504 + T.Mul(7, 63) - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(0, 768) - -T.Mul(0, 768) + T.Mul(3, 192) - -T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 - -T.Mul(7, 3) - -T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) - -T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0) - -kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 
63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - -conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - -T.Mul(0, 4) - -T.Mul(0, 4) + 3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - -1 - -T.Mul(1, 4) - -T.Add(T.Mul(1, 4), 0) - -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(1, 768) - -T.Mul(1, 768) + T.Mul(0, 192) - -T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 - -T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) - -T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0) - -kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - -T.Mul(1, 4) - -T.Add(T.Mul(1, 4), 0) - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), 
scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 1 - -conv2d_nchw[T.Mul(1, 4) + 1] - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(1, 768) - -T.Mul(1, 768) + T.Mul(1, 192) - -T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 - -T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) - -T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0) - -kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - -conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 2 - -conv2d_nchw[T.Mul(1, 4) + 2] - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + 
T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(1, 768) - -T.Mul(1, 768) + T.Mul(2, 192) - -T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 - -T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) - -T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0) - -kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - -conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 3 - -conv2d_nchw[T.Mul(1, 4) + 3] - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(1, 768) - -T.Mul(1, 768) + T.Mul(3, 192) - -T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 - -T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) - -T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0) - -kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + 
T.Mul(0, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - -conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - -T.Mul(1, 4) - -T.Add(T.Mul(1, 4), 0) - -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(1, 768) - -T.Mul(1, 768) + T.Mul(0, 192) - -T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 - -T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) - -T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0) - -kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - -T.Mul(1, 4) - -T.Add(T.Mul(1, 4), 0) - -conv2d_nchw = T.Buffer((8,), 
scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 1 - -conv2d_nchw[T.Mul(1, 4) + 1] - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(1, 768) - -T.Mul(1, 768) + T.Mul(1, 192) - -T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 - -T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) - -T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0) - -kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - -conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 2 - -conv2d_nchw[T.Mul(1, 4) + 2] 
- -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(1, 768) - -T.Mul(1, 768) + T.Mul(2, 192) - -T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 - -T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) - -T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0) - -kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - -conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 3 - -conv2d_nchw[T.Mul(1, 4) + 3] - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(1, 768) - -T.Mul(1, 768) + T.Mul(3, 192) - -T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 - -T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) - -T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0) - 
-kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - -conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - -T.Mul(1, 4) - -T.Add(T.Mul(1, 4), 0) - -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(1, 768) - -T.Mul(1, 768) + T.Mul(0, 192) - -T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 - -T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) - -T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0) - -kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - 
-T.Mul(1, 4) - -T.Add(T.Mul(1, 4), 0) - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 1 - -conv2d_nchw[T.Mul(1, 4) + 1] - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(1, 768) - -T.Mul(1, 768) + T.Mul(1, 192) - -T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 - -T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) - -T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0) - -kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - -conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 
0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 2 - -conv2d_nchw[T.Mul(1, 4) + 2] - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(1, 768) - -T.Mul(1, 768) + T.Mul(2, 192) - -T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 - -T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) - -T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0) - -kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - -conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 3 - -conv2d_nchw[T.Mul(1, 4) + 3] - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(1, 768) - -T.Mul(1, 768) + T.Mul(3, 192) - -T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 - -T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) - -T.Add(T.Mul(1, 768) + T.Mul(3, 
192) + rc_outer_inner * 24 + T.Mul(2, 3), 0) - -kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - -conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - -T.Mul(1, 4) - -T.Add(T.Mul(1, 4), 0) - -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(1, 768) - -T.Mul(1, 768) + T.Mul(0, 192) - -T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 - -T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) - -T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0) - -kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 
192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - -T.Mul(1, 4) - -T.Add(T.Mul(1, 4), 0) - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 1 - -conv2d_nchw[T.Mul(1, 4) + 1] - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(1, 768) - -T.Mul(1, 768) + T.Mul(1, 192) - -T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 - -T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) - -T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0) - -kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - -conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + 
T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 2 - -conv2d_nchw[T.Mul(1, 4) + 2] - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(1, 768) - -T.Mul(1, 768) + T.Mul(2, 192) - -T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 - -T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) - -T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0) - -kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - -conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 3 - -conv2d_nchw[T.Mul(1, 4) + 3] - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(1, 768) - -T.Mul(1, 768) + T.Mul(3, 192) - -T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 - -T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 
24 + T.Mul(3, 3) - -T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0) - -kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - -conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - -T.Mul(1, 4) - -T.Add(T.Mul(1, 4), 0) - -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(1, 768) - -T.Mul(1, 768) + T.Mul(0, 192) - -T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 - -T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) - -T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0) - -kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + 
threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - -T.Mul(1, 4) - -T.Add(T.Mul(1, 4), 0) - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 1 - -conv2d_nchw[T.Mul(1, 4) + 1] - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(1, 768) - -T.Mul(1, 768) + T.Mul(1, 192) - -T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 - -T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) - -T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0) - -kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - -conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 
7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 2 - -conv2d_nchw[T.Mul(1, 4) + 2] - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(1, 768) - -T.Mul(1, 768) + T.Mul(2, 192) - -T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 - -T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) - -T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0) - -kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - -conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 3 - -conv2d_nchw[T.Mul(1, 4) + 3] - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(1, 768) - -T.Mul(1, 768) + T.Mul(3, 192) - -T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 
24 - -T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) - -T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0) - -kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - -conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - -T.Mul(1, 4) - -T.Add(T.Mul(1, 4), 0) - -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(1, 768) - -T.Mul(1, 768) + T.Mul(0, 192) - -T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 - -T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) - -T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0) - -kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - -T.Mul(1, 4) - -T.Add(T.Mul(1, 4), 0) - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 1 - -conv2d_nchw[T.Mul(1, 4) + 1] - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(1, 768) - -T.Mul(1, 768) + T.Mul(1, 192) - -T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 - -T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) - -T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0) - -kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - -conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 
1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 2 - -conv2d_nchw[T.Mul(1, 4) + 2] - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(1, 768) - -T.Mul(1, 768) + T.Mul(2, 192) - -T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 - -T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) - -T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0) - -kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - -conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 3 - -conv2d_nchw[T.Mul(1, 4) + 3] - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(1, 768) - -T.Mul(1, 768) 
+ T.Mul(3, 192) - -T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 - -T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) - -T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0) - -kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - -conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - -T.Mul(1, 4) - -T.Add(T.Mul(1, 4), 0) - -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(1, 768) - -T.Mul(1, 768) + T.Mul(0, 192) - -T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 - -T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) - -T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0) - -kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 
0)] - -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - -T.Mul(1, 4) - -T.Add(T.Mul(1, 4), 0) - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 1 - -conv2d_nchw[T.Mul(1, 4) + 1] - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(1, 768) - -T.Mul(1, 768) + T.Mul(1, 192) - -T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 - -T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) - -T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0) - -kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - -conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") 
-conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 2 - -conv2d_nchw[T.Mul(1, 4) + 2] - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(1, 768) - -T.Mul(1, 768) + T.Mul(2, 192) - -T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 - -T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) - -T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0) - -kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - -conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 3 - -conv2d_nchw[T.Mul(1, 4) + 3] - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + 
T.Mul(0, 7) + threadIdx_x] - -T.Mul(1, 768) - -T.Mul(1, 768) + T.Mul(3, 192) - -T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 - -T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) - -T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0) - -kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - -conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - -T.Mul(1, 4) - -T.Add(T.Mul(1, 4), 0) - -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(1, 768) - -T.Mul(1, 768) + T.Mul(0, 192) - -T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 - -T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) - -T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0) - -kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * 
kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - -T.Mul(1, 4) - -T.Add(T.Mul(1, 4), 0) - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 1 - -conv2d_nchw[T.Mul(1, 4) + 1] - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(1, 768) - -T.Mul(1, 768) + T.Mul(1, 192) - -T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 - -T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) - -T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0) - -kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - -conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() 
-threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 2 - -conv2d_nchw[T.Mul(1, 4) + 2] - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(1, 768) - -T.Mul(1, 768) + T.Mul(2, 192) - -T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 - -T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) - -T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0) - -kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - -conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 3 - -conv2d_nchw[T.Mul(1, 4) + 3] - -T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 
7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] - -T.Mul(1, 768) - -T.Mul(1, 768) + T.Mul(3, 192) - -T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 - -T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) - -T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0) - -kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - -conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - -T.Mul(1, 4) - -T.Mul(1, 4) + 3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - -1 - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1 - -kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * 
kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1 - -kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - -conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(2, 192) + 
rc_outer_inner * 24 + T.Mul(0, 3) + 1 - -kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - -conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1 - -kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - -conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1 - -kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1 - -kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - -conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner 
* 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1 - -kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - -conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] 
- -T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1 - -kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - -conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1 - -kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = 
conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1 - -kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - -conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1 - -kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - -conv2d_nchw[T.Mul(0, 4) + 2] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1 - -kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - -conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 
63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1 - -kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1 - -kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - -conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") 
-conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1 - -kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - -conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1 - -kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - 
-conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1 - -kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x - 
-pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1 - -kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - -conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1 - -kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - -conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = 
T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1 - -kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - -conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1 - -kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + 
T.Mul(5, 3) + 1] - -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1 - -kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - -conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 
7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1 - -kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - -conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1 - -kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - -conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() 
-kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1 - -kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1 - -kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + 
T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - -conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1 - -kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - -conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + 
T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1 - -kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - -conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1 - -kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = 
T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1 - -kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - -conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1 - -kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * 
kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - -conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1 - -kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - -conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) - 
-rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1 - -kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1 - -kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - -conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), 
scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1 - -kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - -conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1 - -kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + 
threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - -conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1 - -kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 
504 + T.Mul(1, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1 - -kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - -conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1 - -kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - -conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = 
T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1 - -kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - -conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1 - -kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 
63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1 - -kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - -conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - -T.Mul(1, 7) - 
-rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1 - -kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - -conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1 - -kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - -conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) 
-pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1 - -kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1 - -kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - 
-pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - -conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1 - -kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - -conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + 
T.Mul(3, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1 - -kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - -conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1 - -kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - -conv2d_nchw = 
T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1 - -kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - -conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1 - -kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 
+ T.Mul(4, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - -conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1 - -kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - -conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + 
rc_outer_inner * 24 + T.Mul(4, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1 - -kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1 - -kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - -conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 
3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1 - -kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - -conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1 - -kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + 
rc_outer_inner * 24 + T.Mul(5, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - -conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1 - -kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * 
kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1 - -kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - -conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1 - -kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - -conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + 
rc_outer_inner * 24 + T.Mul(6, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1 - -kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - -conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1 - -kernel_shared[T.Mul(1, 
768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1 - -kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - -conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + 
threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1 - -kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - -conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - -T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1 - -kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - -conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + 
T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - -2 - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2 - -kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + 
T.Mul(0, 3) + 2 - -kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - -conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2 - -kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - -conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 
63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2 - -kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - -conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2 - -kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * 
kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2 - -kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - -conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(2, 192) + 
rc_outer_inner * 24 + T.Mul(1, 3) + 2 - -kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - -conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2 - -kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - -conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2 - -kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2 - -kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - -conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner 
* 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2 - -kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - -conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] 
- -T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2 - -kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - -conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2 - -kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = 
conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2 - -kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - -conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2 - -kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - -conv2d_nchw[T.Mul(0, 4) + 2] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2 - -kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - -conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 
63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2 - -kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2 - -kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - -conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") 
-conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2 - -kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - -conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2 - -kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - 
-conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2 - -kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x - 
-pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2 - -kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - -conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2 - -kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - -conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = 
T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2 - -kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - -conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2 - -kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + 
T.Mul(6, 3) + 2] - -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2 - -kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - -conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 
7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2 - -kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - -conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2 - -kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - -conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() 
-kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2 - -kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2 - -kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + 
T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2 - -kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + 
T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2 - -kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2 - -kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = 
T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2 - -kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - -conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2 - -kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * 
kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - -conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2 - -kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - -conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) - 
-rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2 - -kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2 - -kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - -conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), 
scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2 - -kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - -conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2 - -kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + 
threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - -conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2 - -kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 
504 + T.Mul(2, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2 - -kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - -conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2 - -kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - -conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = 
T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2 - -kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - -conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2 - -kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 
63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2 - -kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - -conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - -T.Mul(2, 7) - 
-rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2 - -kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - -conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2 - -kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - -conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) 
-pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2 - -kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2 - -kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - 
-pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - -conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2 - -kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - -conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + 
T.Mul(4, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2 - -kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - -conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2 - -kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - -conv2d_nchw = 
T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2 - -kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - -conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2 - -kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 
+ T.Mul(5, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - -conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2 - -kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - -conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + 
rc_outer_inner * 24 + T.Mul(5, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2 - -kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2 - -kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - -conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 
3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2 - -kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - -conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2 - -kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + 
rc_outer_inner * 24 + T.Mul(6, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - -conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2 - -kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * 
kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2 - -kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2 - -kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + 
rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) - -rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] - -T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2 - -kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + 
threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + 
rc_outer_inner * 24 + T.Mul(4, 3), 0)] -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] -conv2d_nchw[T.Mul(0, 4) + 1] = 
conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 
63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 
192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] -conv2d_nchw[T.Mul(1, 4) + 2] = 
conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 
63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + 
rc_outer_inner * 24 + T.Mul(0, 3) + 1] -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) 
+ rc_outer_inner * 24 + T.Mul(4, 3) + 1] -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) 
+ rc_outer_inner * 24 + T.Mul(1, 3) + 1] -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) 
+ rc_outer_inner * 24 + T.Mul(5, 3) + 1] -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) 
+ rc_outer_inner * 24 + T.Mul(1, 3) + 2] -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) 
+ rc_outer_inner * 24 + T.Mul(5, 3) + 2] -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] -conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] -conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] -conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) 
+ rc_outer_inner * 24 + T.Mul(2, 3) + 2] -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) 
+ rc_outer_inner * 24 + T.Mul(6, 3) + 2] -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] -conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] -conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] -conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -for rc_outer_inner in range(8): - conv2d_nchw = T.Buffer((8,), scope="local", align=32) - pad_temp_shared = T.Buffer((4032,), scope="shared") - threadIdx_x = T.int32() - kernel_shared = T.Buffer((1536,), scope="shared") - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + 
threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + 
rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = 
conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * 
kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 
+ T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = 
conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + 
T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - 
conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + 
T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + 
T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + 
T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + 
T.Mul(3, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + 
T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + 
T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -threadIdx_x = T.env_thread("threadIdx.x") -pad_temp_shared = T.Buffer((4032,), scope="shared") -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -with T.launch_thread(threadIdx_x, 49): - if T.Mul(0, 7) + threadIdx_x // 7 < 576: - if T.Mul(0, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(0, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(0, 7) + threadIdx_x // 7) % 9 and (T.Mul(0, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x // 7) % 9 * 7 + 
rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(1, 7) + threadIdx_x // 7 < 576: - if T.Mul(1, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(1, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(1, 7) + threadIdx_x // 7) % 9 and (T.Mul(1, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(2, 7) + threadIdx_x // 7 < 576: - if T.Mul(2, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(2, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(2, 7) + threadIdx_x // 7) % 9 and (T.Mul(2, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(3, 7) + threadIdx_x // 7 < 576: - if T.Mul(3, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(3, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(3, 7) + threadIdx_x // 7) % 9 and (T.Mul(3, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(4, 7) + threadIdx_x // 7 < 576: - if T.Mul(4, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(4, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(4, 7) + threadIdx_x // 7) % 9 and (T.Mul(4, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, 
data[rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(5, 7) + threadIdx_x // 7 < 576: - if T.Mul(5, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(5, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(5, 7) + threadIdx_x // 7) % 9 and (T.Mul(5, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(6, 7) + threadIdx_x // 7 < 576: - if T.Mul(6, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(6, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(6, 7) + threadIdx_x // 7) % 9 and (T.Mul(6, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(6, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(7, 7) + threadIdx_x // 7 < 576: - if T.Mul(7, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(7, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(7, 7) + threadIdx_x // 7) % 9 and (T.Mul(7, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(8, 7) + threadIdx_x // 7 < 576: - if T.Mul(8, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(8, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(8, 7) + threadIdx_x // 7) % 9 and 
(T.Mul(8, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(9, 7) + threadIdx_x // 7 < 576: - if T.Mul(9, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(9, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(9, 7) + threadIdx_x // 7) % 9 and (T.Mul(9, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(10, 7) + threadIdx_x // 7 < 576: - if T.Mul(10, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(10, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(10, 7) + threadIdx_x // 7) % 9 and (T.Mul(10, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(11, 7) + threadIdx_x // 7 < 576: - if T.Mul(11, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(11, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(11, 7) + threadIdx_x // 7) % 9 and (T.Mul(11, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(12, 7) + threadIdx_x // 7 < 576: - if T.Mul(12, 
49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(12, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(12, 7) + threadIdx_x // 7) % 9 and (T.Mul(12, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(13, 7) + threadIdx_x // 7 < 576: - if T.Mul(13, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(13, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(13, 7) + threadIdx_x // 7) % 9 and (T.Mul(13, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(13, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(14, 7) + threadIdx_x // 7 < 576: - if T.Mul(14, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(14, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(14, 7) + threadIdx_x // 7) % 9 and (T.Mul(14, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(15, 7) + threadIdx_x // 7 < 576: - if T.Mul(15, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(15, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(15, 7) + threadIdx_x // 7) % 9 and (T.Mul(15, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x // 7) % 9 * 7 + 
rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(16, 7) + threadIdx_x // 7 < 576: - if T.Mul(16, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(16, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(16, 7) + threadIdx_x // 7) % 9 and (T.Mul(16, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(16, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(17, 7) + threadIdx_x // 7 < 576: - if T.Mul(17, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(17, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(17, 7) + threadIdx_x // 7) % 9 and (T.Mul(17, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(18, 7) + threadIdx_x // 7 < 576: - if T.Mul(18, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(18, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(18, 7) + threadIdx_x // 7) % 9 and (T.Mul(18, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(19, 7) + threadIdx_x // 7 < 576: - if T.Mul(19, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(19, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(19, 7) + threadIdx_x // 7) % 9 and (T.Mul(19, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and 
rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(20, 7) + threadIdx_x // 7 < 576: - if T.Mul(20, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(20, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(20, 7) + threadIdx_x // 7) % 9 and (T.Mul(20, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(21, 7) + threadIdx_x // 7 < 576: - if T.Mul(21, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(21, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(21, 7) + threadIdx_x // 7) % 9 and (T.Mul(21, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(22, 7) + threadIdx_x // 7 < 576: - if T.Mul(22, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(22, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(22, 7) + threadIdx_x // 7) % 9 and (T.Mul(22, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(23, 7) + threadIdx_x // 7 < 576: - if T.Mul(23, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(23, 49) + threadIdx_x] = 
T.if_then_else(1 <= (T.Mul(23, 7) + threadIdx_x // 7) % 9 and (T.Mul(23, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(24, 7) + threadIdx_x // 7 < 576: - if T.Mul(24, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(24, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(24, 7) + threadIdx_x // 7) % 9 and (T.Mul(24, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(25, 7) + threadIdx_x // 7 < 576: - if T.Mul(25, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(25, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(25, 7) + threadIdx_x // 7) % 9 and (T.Mul(25, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(25, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(26, 7) + threadIdx_x // 7 < 576: - if T.Mul(26, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(26, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(26, 7) + threadIdx_x // 7) % 9 and (T.Mul(26, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with 
T.launch_thread(threadIdx_x, 49): - if T.Mul(27, 7) + threadIdx_x // 7 < 576: - if T.Mul(27, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(27, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(27, 7) + threadIdx_x // 7) % 9 and (T.Mul(27, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(28, 7) + threadIdx_x // 7 < 576: - if T.Mul(28, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(28, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(28, 7) + threadIdx_x // 7) % 9 and (T.Mul(28, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(29, 7) + threadIdx_x // 7 < 576: - if T.Mul(29, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(29, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(29, 7) + threadIdx_x // 7) % 9 and (T.Mul(29, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(30, 7) + threadIdx_x // 7 < 576: - if T.Mul(30, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(30, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(30, 7) + threadIdx_x // 7) % 9 and (T.Mul(30, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + 
(T.Mul(30, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(31, 7) + threadIdx_x // 7 < 576: - if T.Mul(31, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(31, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(31, 7) + threadIdx_x // 7) % 9 and (T.Mul(31, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(31, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(32, 7) + threadIdx_x // 7 < 576: - if T.Mul(32, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(32, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(32, 7) + threadIdx_x // 7) % 9 and (T.Mul(32, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(33, 7) + threadIdx_x // 7 < 576: - if T.Mul(33, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(33, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(33, 7) + threadIdx_x // 7) % 9 and (T.Mul(33, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(34, 7) + threadIdx_x // 7 < 576: - if T.Mul(34, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(34, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(34, 7) + threadIdx_x // 7) % 9 and (T.Mul(34, 
7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(35, 7) + threadIdx_x // 7 < 576: - if T.Mul(35, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(35, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(35, 7) + threadIdx_x // 7) % 9 and (T.Mul(35, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(36, 7) + threadIdx_x // 7 < 576: - if T.Mul(36, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(36, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(36, 7) + threadIdx_x // 7) % 9 and (T.Mul(36, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(37, 7) + threadIdx_x // 7 < 576: - if T.Mul(37, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(37, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(37, 7) + threadIdx_x // 7) % 9 and (T.Mul(37, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(37, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(38, 7) + threadIdx_x // 7 < 576: - if T.Mul(38, 49) 
+ threadIdx_x < 4032: - pad_temp_shared[T.Mul(38, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(38, 7) + threadIdx_x // 7) % 9 and (T.Mul(38, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(39, 7) + threadIdx_x // 7 < 576: - if T.Mul(39, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(39, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(39, 7) + threadIdx_x // 7) % 9 and (T.Mul(39, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(40, 7) + threadIdx_x // 7 < 576: - if T.Mul(40, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(40, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(40, 7) + threadIdx_x // 7) % 9 and (T.Mul(40, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(41, 7) + threadIdx_x // 7 < 576: - if T.Mul(41, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(41, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(41, 7) + threadIdx_x // 7) % 9 and (T.Mul(41, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x // 7) % 9 * 7 + 
rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(42, 7) + threadIdx_x // 7 < 576: - if T.Mul(42, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(42, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(42, 7) + threadIdx_x // 7) % 9 and (T.Mul(42, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(43, 7) + threadIdx_x // 7 < 576: - if T.Mul(43, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(43, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(43, 7) + threadIdx_x // 7) % 9 and (T.Mul(43, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(44, 7) + threadIdx_x // 7 < 576: - if T.Mul(44, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(44, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(44, 7) + threadIdx_x // 7) % 9 and (T.Mul(44, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(45, 7) + threadIdx_x // 7 < 576: - if T.Mul(45, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(45, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(45, 7) + threadIdx_x // 7) % 9 and (T.Mul(45, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and 
rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(46, 7) + threadIdx_x // 7 < 576: - if T.Mul(46, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(46, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(46, 7) + threadIdx_x // 7) % 9 and (T.Mul(46, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(47, 7) + threadIdx_x // 7 < 576: - if T.Mul(47, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(47, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(47, 7) + threadIdx_x // 7) % 9 and (T.Mul(47, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(48, 7) + threadIdx_x // 7 < 576: - if T.Mul(48, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(48, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(48, 7) + threadIdx_x // 7) % 9 and (T.Mul(48, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(49, 7) + threadIdx_x // 7 < 576: - if T.Mul(49, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(49, 49) + threadIdx_x] = 
T.if_then_else(1 <= (T.Mul(49, 7) + threadIdx_x // 7) % 9 and (T.Mul(49, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(50, 7) + threadIdx_x // 7 < 576: - if T.Mul(50, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(50, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(50, 7) + threadIdx_x // 7) % 9 and (T.Mul(50, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(51, 7) + threadIdx_x // 7 < 576: - if T.Mul(51, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(51, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(51, 7) + threadIdx_x // 7) % 9 and (T.Mul(51, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(51, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(52, 7) + threadIdx_x // 7 < 576: - if T.Mul(52, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(52, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(52, 7) + threadIdx_x // 7) % 9 and (T.Mul(52, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with 
T.launch_thread(threadIdx_x, 49): - if T.Mul(53, 7) + threadIdx_x // 7 < 576: - if T.Mul(53, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(53, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(53, 7) + threadIdx_x // 7) % 9 and (T.Mul(53, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(54, 7) + threadIdx_x // 7 < 576: - if T.Mul(54, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(54, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(54, 7) + threadIdx_x // 7) % 9 and (T.Mul(54, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(55, 7) + threadIdx_x // 7 < 576: - if T.Mul(55, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(55, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(55, 7) + threadIdx_x // 7) % 9 and (T.Mul(55, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(56, 7) + threadIdx_x // 7 < 576: - if T.Mul(56, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(56, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(56, 7) + threadIdx_x // 7) % 9 and (T.Mul(56, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + 
(T.Mul(56, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(57, 7) + threadIdx_x // 7 < 576: - if T.Mul(57, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(57, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(57, 7) + threadIdx_x // 7) % 9 and (T.Mul(57, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(58, 7) + threadIdx_x // 7 < 576: - if T.Mul(58, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(58, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(58, 7) + threadIdx_x // 7) % 9 and (T.Mul(58, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(59, 7) + threadIdx_x // 7 < 576: - if T.Mul(59, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(59, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(59, 7) + threadIdx_x // 7) % 9 and (T.Mul(59, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(60, 7) + threadIdx_x // 7 < 576: - if T.Mul(60, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(60, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(60, 7) + threadIdx_x // 7) % 9 and (T.Mul(60, 
7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(61, 7) + threadIdx_x // 7 < 576: - if T.Mul(61, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(61, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(61, 7) + threadIdx_x // 7) % 9 and (T.Mul(61, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(62, 7) + threadIdx_x // 7 < 576: - if T.Mul(62, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(62, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(62, 7) + threadIdx_x // 7) % 9 and (T.Mul(62, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(63, 7) + threadIdx_x // 7 < 576: - if T.Mul(63, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(63, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(63, 7) + threadIdx_x // 7) % 9 and (T.Mul(63, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(64, 7) + threadIdx_x // 7 < 576: - if T.Mul(64, 49) 
+ threadIdx_x < 4032: - pad_temp_shared[T.Mul(64, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(64, 7) + threadIdx_x // 7) % 9 and (T.Mul(64, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(65, 7) + threadIdx_x // 7 < 576: - if T.Mul(65, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(65, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(65, 7) + threadIdx_x // 7) % 9 and (T.Mul(65, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(66, 7) + threadIdx_x // 7 < 576: - if T.Mul(66, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(66, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(66, 7) + threadIdx_x // 7) % 9 and (T.Mul(66, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(67, 7) + threadIdx_x // 7 < 576: - if T.Mul(67, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(67, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(67, 7) + threadIdx_x // 7) % 9 and (T.Mul(67, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x // 7) % 9 * 7 + 
rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(68, 7) + threadIdx_x // 7 < 576: - if T.Mul(68, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(68, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(68, 7) + threadIdx_x // 7) % 9 and (T.Mul(68, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(69, 7) + threadIdx_x // 7 < 576: - if T.Mul(69, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(69, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(69, 7) + threadIdx_x // 7) % 9 and (T.Mul(69, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(70, 7) + threadIdx_x // 7 < 576: - if T.Mul(70, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(70, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(70, 7) + threadIdx_x // 7) % 9 and (T.Mul(70, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(71, 7) + threadIdx_x // 7 < 576: - if T.Mul(71, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(71, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(71, 7) + threadIdx_x // 7) % 9 and (T.Mul(71, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and 
rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(72, 7) + threadIdx_x // 7 < 576: - if T.Mul(72, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(72, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(72, 7) + threadIdx_x // 7) % 9 and (T.Mul(72, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(73, 7) + threadIdx_x // 7 < 576: - if T.Mul(73, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(73, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(73, 7) + threadIdx_x // 7) % 9 and (T.Mul(73, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(74, 7) + threadIdx_x // 7 < 576: - if T.Mul(74, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(74, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(74, 7) + threadIdx_x // 7) % 9 and (T.Mul(74, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(75, 7) + threadIdx_x // 7 < 576: - if T.Mul(75, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(75, 49) + threadIdx_x] = 
T.if_then_else(1 <= (T.Mul(75, 7) + threadIdx_x // 7) % 9 and (T.Mul(75, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(76, 7) + threadIdx_x // 7 < 576: - if T.Mul(76, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(76, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(76, 7) + threadIdx_x // 7) % 9 and (T.Mul(76, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(77, 7) + threadIdx_x // 7 < 576: - if T.Mul(77, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(77, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(77, 7) + threadIdx_x // 7) % 9 and (T.Mul(77, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(77, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(78, 7) + threadIdx_x // 7 < 576: - if T.Mul(78, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(78, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(78, 7) + threadIdx_x // 7) % 9 and (T.Mul(78, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with 
T.launch_thread(threadIdx_x, 49): - if T.Mul(79, 7) + threadIdx_x // 7 < 576: - if T.Mul(79, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(79, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(79, 7) + threadIdx_x // 7) % 9 and (T.Mul(79, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(80, 7) + threadIdx_x // 7 < 576: - if T.Mul(80, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(80, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(80, 7) + threadIdx_x // 7) % 9 and (T.Mul(80, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(81, 7) + threadIdx_x // 7 < 576: - if T.Mul(81, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(81, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(81, 7) + threadIdx_x // 7) % 9 and (T.Mul(81, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if T.Mul(82, 7) + threadIdx_x // 7 < 576: - if T.Mul(82, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(82, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(82, 7) + threadIdx_x // 7) % 9 and (T.Mul(82, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + 
(T.Mul(82, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -threadIdx_x_1 = T.env_thread("threadIdx.x") -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(0, 49) + threadIdx_x_1 < 1536: - blockIdx_x = T.int32() - kernel_shared[T.Mul(0, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(0, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(0, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] -blockIdx_x = T.int32() -with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(1, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(1, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(1, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(1, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(2, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(2, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(2, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(2, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(3, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(3, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(3, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(3, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(4, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(4, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(4, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(4, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(5, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(5, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(5, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + 
(T.Mul(5, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(6, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(6, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(6, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(6, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(7, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(7, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(7, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(7, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(8, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(8, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(8, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(8, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(9, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(9, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(9, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(9, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(10, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(10, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(10, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(10, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(11, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(11, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(11, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(11, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(12, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(12, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(12, 49) + threadIdx_x_1) // 
192 * 4608 + rc_outer_outer * 576 + (T.Mul(12, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(13, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(13, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(13, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(13, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(14, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(14, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(14, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(14, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(15, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(15, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(15, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(15, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(16, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(16, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(16, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(16, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(17, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(17, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(17, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(17, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(18, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(18, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(18, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(18, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(19, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(19, 49) + threadIdx_x_1] = 
kernel[blockIdx_x * 36864 + (T.Mul(19, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(19, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(20, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(20, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(20, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(20, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(21, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(21, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(21, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(21, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(22, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(22, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(22, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(22, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(23, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(23, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(23, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(23, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(24, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(24, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(24, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(24, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(25, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(25, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(25, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(25, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(26, 49) + threadIdx_x_1 < 
1536: - kernel_shared[T.Mul(26, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(26, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(26, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(27, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(27, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(27, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(27, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(28, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(28, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(28, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(28, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(29, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(29, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(29, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(29, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(30, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(30, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(30, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(30, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(31, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(31, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(31, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(31, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] -for rc_outer_inner in range(8): - conv2d_nchw = T.Buffer((8,), scope="local", align=32) - threadIdx_x_2 = T.int32() - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + 
T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] 
- conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 
4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * 
kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + 
rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - 
conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) 
+ 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + 
T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + 
T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * 
kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - 
conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * 
kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - 
conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + 
rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * 
kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - 
conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + 
rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -for rx_outer_outer in range(3): - threadIdx_x = 
T.env_thread("threadIdx.x") - pad_temp_shared = T.Buffer((4032,), scope="shared") - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - with T.launch_thread(threadIdx_x, 49): - if T.Mul(0, 7) + threadIdx_x // 7 < 576: - if T.Mul(0, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(0, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(0, 7) + threadIdx_x // 7) % 9 and (T.Mul(0, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(1, 7) + threadIdx_x // 7 < 576: - if T.Mul(1, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(1, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(1, 7) + threadIdx_x // 7) % 9 and (T.Mul(1, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(2, 7) + threadIdx_x // 7 < 576: - if T.Mul(2, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(2, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(2, 7) + threadIdx_x // 7) % 9 and (T.Mul(2, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(3, 7) + threadIdx_x // 7 < 576: - if T.Mul(3, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(3, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(3, 7) + threadIdx_x // 7) % 9 and (T.Mul(3, 7) + threadIdx_x // 7) % 9 
< 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(4, 7) + threadIdx_x // 7 < 576: - if T.Mul(4, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(4, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(4, 7) + threadIdx_x // 7) % 9 and (T.Mul(4, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(5, 7) + threadIdx_x // 7 < 576: - if T.Mul(5, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(5, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(5, 7) + threadIdx_x // 7) % 9 and (T.Mul(5, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(6, 7) + threadIdx_x // 7 < 576: - if T.Mul(6, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(6, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(6, 7) + threadIdx_x // 7) % 9 and (T.Mul(6, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(6, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(7, 7) + threadIdx_x // 7 < 576: - if T.Mul(7, 49) + threadIdx_x < 4032: - 
pad_temp_shared[T.Mul(7, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(7, 7) + threadIdx_x // 7) % 9 and (T.Mul(7, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(8, 7) + threadIdx_x // 7 < 576: - if T.Mul(8, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(8, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(8, 7) + threadIdx_x // 7) % 9 and (T.Mul(8, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(9, 7) + threadIdx_x // 7 < 576: - if T.Mul(9, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(9, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(9, 7) + threadIdx_x // 7) % 9 and (T.Mul(9, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(10, 7) + threadIdx_x // 7 < 576: - if T.Mul(10, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(10, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(10, 7) + threadIdx_x // 7) % 9 and (T.Mul(10, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], 
T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(11, 7) + threadIdx_x // 7 < 576: - if T.Mul(11, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(11, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(11, 7) + threadIdx_x // 7) % 9 and (T.Mul(11, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(12, 7) + threadIdx_x // 7 < 576: - if T.Mul(12, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(12, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(12, 7) + threadIdx_x // 7) % 9 and (T.Mul(12, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(13, 7) + threadIdx_x // 7 < 576: - if T.Mul(13, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(13, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(13, 7) + threadIdx_x // 7) % 9 and (T.Mul(13, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(13, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(14, 7) + threadIdx_x // 7 < 576: - if T.Mul(14, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(14, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(14, 7) + threadIdx_x // 7) % 9 and (T.Mul(14, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, 
data[rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(15, 7) + threadIdx_x // 7 < 576: - if T.Mul(15, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(15, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(15, 7) + threadIdx_x // 7) % 9 and (T.Mul(15, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(16, 7) + threadIdx_x // 7 < 576: - if T.Mul(16, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(16, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(16, 7) + threadIdx_x // 7) % 9 and (T.Mul(16, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(16, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(17, 7) + threadIdx_x // 7 < 576: - if T.Mul(17, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(17, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(17, 7) + threadIdx_x // 7) % 9 and (T.Mul(17, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(18, 7) + threadIdx_x // 7 < 576: - if T.Mul(18, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(18, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(18, 7) + 
threadIdx_x // 7) % 9 and (T.Mul(18, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(19, 7) + threadIdx_x // 7 < 576: - if T.Mul(19, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(19, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(19, 7) + threadIdx_x // 7) % 9 and (T.Mul(19, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(20, 7) + threadIdx_x // 7 < 576: - if T.Mul(20, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(20, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(20, 7) + threadIdx_x // 7) % 9 and (T.Mul(20, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(21, 7) + threadIdx_x // 7 < 576: - if T.Mul(21, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(21, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(21, 7) + threadIdx_x // 7) % 9 and (T.Mul(21, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(22, 7) + 
threadIdx_x // 7 < 576: - if T.Mul(22, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(22, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(22, 7) + threadIdx_x // 7) % 9 and (T.Mul(22, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(23, 7) + threadIdx_x // 7 < 576: - if T.Mul(23, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(23, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(23, 7) + threadIdx_x // 7) % 9 and (T.Mul(23, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(24, 7) + threadIdx_x // 7 < 576: - if T.Mul(24, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(24, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(24, 7) + threadIdx_x // 7) % 9 and (T.Mul(24, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(25, 7) + threadIdx_x // 7 < 576: - if T.Mul(25, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(25, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(25, 7) + threadIdx_x // 7) % 9 and (T.Mul(25, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(25, 
7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(26, 7) + threadIdx_x // 7 < 576: - if T.Mul(26, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(26, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(26, 7) + threadIdx_x // 7) % 9 and (T.Mul(26, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(27, 7) + threadIdx_x // 7 < 576: - if T.Mul(27, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(27, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(27, 7) + threadIdx_x // 7) % 9 and (T.Mul(27, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(28, 7) + threadIdx_x // 7 < 576: - if T.Mul(28, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(28, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(28, 7) + threadIdx_x // 7) % 9 and (T.Mul(28, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(29, 7) + threadIdx_x // 7 < 576: - if T.Mul(29, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(29, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(29, 7) + threadIdx_x // 7) % 9 and (T.Mul(29, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= 
rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(30, 7) + threadIdx_x // 7 < 576: - if T.Mul(30, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(30, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(30, 7) + threadIdx_x // 7) % 9 and (T.Mul(30, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(31, 7) + threadIdx_x // 7 < 576: - if T.Mul(31, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(31, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(31, 7) + threadIdx_x // 7) % 9 and (T.Mul(31, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(31, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(32, 7) + threadIdx_x // 7 < 576: - if T.Mul(32, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(32, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(32, 7) + threadIdx_x // 7) % 9 and (T.Mul(32, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(33, 7) + threadIdx_x // 7 < 576: - if T.Mul(33, 49) + threadIdx_x < 4032: - 
pad_temp_shared[T.Mul(33, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(33, 7) + threadIdx_x // 7) % 9 and (T.Mul(33, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(34, 7) + threadIdx_x // 7 < 576: - if T.Mul(34, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(34, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(34, 7) + threadIdx_x // 7) % 9 and (T.Mul(34, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(35, 7) + threadIdx_x // 7 < 576: - if T.Mul(35, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(35, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(35, 7) + threadIdx_x // 7) % 9 and (T.Mul(35, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(36, 7) + threadIdx_x // 7 < 576: - if T.Mul(36, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(36, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(36, 7) + threadIdx_x // 7) % 9 and (T.Mul(36, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 
8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(37, 7) + threadIdx_x // 7 < 576: - if T.Mul(37, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(37, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(37, 7) + threadIdx_x // 7) % 9 and (T.Mul(37, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(37, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(38, 7) + threadIdx_x // 7 < 576: - if T.Mul(38, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(38, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(38, 7) + threadIdx_x // 7) % 9 and (T.Mul(38, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(39, 7) + threadIdx_x // 7 < 576: - if T.Mul(39, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(39, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(39, 7) + threadIdx_x // 7) % 9 and (T.Mul(39, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(40, 7) + threadIdx_x // 7 < 576: - if T.Mul(40, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(40, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(40, 7) + threadIdx_x // 7) % 9 and (T.Mul(40, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, 
data[rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(41, 7) + threadIdx_x // 7 < 576: - if T.Mul(41, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(41, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(41, 7) + threadIdx_x // 7) % 9 and (T.Mul(41, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(42, 7) + threadIdx_x // 7 < 576: - if T.Mul(42, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(42, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(42, 7) + threadIdx_x // 7) % 9 and (T.Mul(42, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(43, 7) + threadIdx_x // 7 < 576: - if T.Mul(43, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(43, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(43, 7) + threadIdx_x // 7) % 9 and (T.Mul(43, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(44, 7) + threadIdx_x // 7 < 576: - if T.Mul(44, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(44, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(44, 7) + 
threadIdx_x // 7) % 9 and (T.Mul(44, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(45, 7) + threadIdx_x // 7 < 576: - if T.Mul(45, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(45, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(45, 7) + threadIdx_x // 7) % 9 and (T.Mul(45, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(46, 7) + threadIdx_x // 7 < 576: - if T.Mul(46, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(46, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(46, 7) + threadIdx_x // 7) % 9 and (T.Mul(46, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(47, 7) + threadIdx_x // 7 < 576: - if T.Mul(47, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(47, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(47, 7) + threadIdx_x // 7) % 9 and (T.Mul(47, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(48, 7) + 
threadIdx_x // 7 < 576: - if T.Mul(48, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(48, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(48, 7) + threadIdx_x // 7) % 9 and (T.Mul(48, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(49, 7) + threadIdx_x // 7 < 576: - if T.Mul(49, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(49, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(49, 7) + threadIdx_x // 7) % 9 and (T.Mul(49, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(50, 7) + threadIdx_x // 7 < 576: - if T.Mul(50, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(50, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(50, 7) + threadIdx_x // 7) % 9 and (T.Mul(50, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(51, 7) + threadIdx_x // 7 < 576: - if T.Mul(51, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(51, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(51, 7) + threadIdx_x // 7) % 9 and (T.Mul(51, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(51, 
7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(52, 7) + threadIdx_x // 7 < 576: - if T.Mul(52, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(52, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(52, 7) + threadIdx_x // 7) % 9 and (T.Mul(52, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(53, 7) + threadIdx_x // 7 < 576: - if T.Mul(53, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(53, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(53, 7) + threadIdx_x // 7) % 9 and (T.Mul(53, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(54, 7) + threadIdx_x // 7 < 576: - if T.Mul(54, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(54, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(54, 7) + threadIdx_x // 7) % 9 and (T.Mul(54, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(55, 7) + threadIdx_x // 7 < 576: - if T.Mul(55, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(55, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(55, 7) + threadIdx_x // 7) % 9 and (T.Mul(55, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= 
rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(56, 7) + threadIdx_x // 7 < 576: - if T.Mul(56, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(56, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(56, 7) + threadIdx_x // 7) % 9 and (T.Mul(56, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(57, 7) + threadIdx_x // 7 < 576: - if T.Mul(57, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(57, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(57, 7) + threadIdx_x // 7) % 9 and (T.Mul(57, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(58, 7) + threadIdx_x // 7 < 576: - if T.Mul(58, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(58, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(58, 7) + threadIdx_x // 7) % 9 and (T.Mul(58, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(59, 7) + threadIdx_x // 7 < 576: - if T.Mul(59, 49) + threadIdx_x < 4032: - 
pad_temp_shared[T.Mul(59, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(59, 7) + threadIdx_x // 7) % 9 and (T.Mul(59, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(60, 7) + threadIdx_x // 7 < 576: - if T.Mul(60, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(60, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(60, 7) + threadIdx_x // 7) % 9 and (T.Mul(60, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(61, 7) + threadIdx_x // 7 < 576: - if T.Mul(61, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(61, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(61, 7) + threadIdx_x // 7) % 9 and (T.Mul(61, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(62, 7) + threadIdx_x // 7 < 576: - if T.Mul(62, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(62, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(62, 7) + threadIdx_x // 7) % 9 and (T.Mul(62, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 
8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(63, 7) + threadIdx_x // 7 < 576: - if T.Mul(63, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(63, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(63, 7) + threadIdx_x // 7) % 9 and (T.Mul(63, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(64, 7) + threadIdx_x // 7 < 576: - if T.Mul(64, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(64, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(64, 7) + threadIdx_x // 7) % 9 and (T.Mul(64, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(65, 7) + threadIdx_x // 7 < 576: - if T.Mul(65, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(65, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(65, 7) + threadIdx_x // 7) % 9 and (T.Mul(65, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(66, 7) + threadIdx_x // 7 < 576: - if T.Mul(66, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(66, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(66, 7) + threadIdx_x // 7) % 9 and (T.Mul(66, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, 
data[rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(67, 7) + threadIdx_x // 7 < 576: - if T.Mul(67, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(67, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(67, 7) + threadIdx_x // 7) % 9 and (T.Mul(67, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(68, 7) + threadIdx_x // 7 < 576: - if T.Mul(68, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(68, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(68, 7) + threadIdx_x // 7) % 9 and (T.Mul(68, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(69, 7) + threadIdx_x // 7 < 576: - if T.Mul(69, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(69, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(69, 7) + threadIdx_x // 7) % 9 and (T.Mul(69, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(70, 7) + threadIdx_x // 7 < 576: - if T.Mul(70, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(70, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(70, 7) + 
threadIdx_x // 7) % 9 and (T.Mul(70, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(71, 7) + threadIdx_x // 7 < 576: - if T.Mul(71, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(71, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(71, 7) + threadIdx_x // 7) % 9 and (T.Mul(71, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(72, 7) + threadIdx_x // 7 < 576: - if T.Mul(72, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(72, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(72, 7) + threadIdx_x // 7) % 9 and (T.Mul(72, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(73, 7) + threadIdx_x // 7 < 576: - if T.Mul(73, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(73, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(73, 7) + threadIdx_x // 7) % 9 and (T.Mul(73, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(74, 7) + 
threadIdx_x // 7 < 576: - if T.Mul(74, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(74, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(74, 7) + threadIdx_x // 7) % 9 and (T.Mul(74, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(75, 7) + threadIdx_x // 7 < 576: - if T.Mul(75, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(75, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(75, 7) + threadIdx_x // 7) % 9 and (T.Mul(75, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(76, 7) + threadIdx_x // 7 < 576: - if T.Mul(76, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(76, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(76, 7) + threadIdx_x // 7) % 9 and (T.Mul(76, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(77, 7) + threadIdx_x // 7 < 576: - if T.Mul(77, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(77, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(77, 7) + threadIdx_x // 7) % 9 and (T.Mul(77, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(77, 
7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(78, 7) + threadIdx_x // 7 < 576: - if T.Mul(78, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(78, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(78, 7) + threadIdx_x // 7) % 9 and (T.Mul(78, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(79, 7) + threadIdx_x // 7 < 576: - if T.Mul(79, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(79, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(79, 7) + threadIdx_x // 7) % 9 and (T.Mul(79, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(80, 7) + threadIdx_x // 7 < 576: - if T.Mul(80, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(80, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(80, 7) + threadIdx_x // 7) % 9 and (T.Mul(80, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(81, 7) + threadIdx_x // 7 < 576: - if T.Mul(81, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(81, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(81, 7) + threadIdx_x // 7) % 9 and (T.Mul(81, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= 
rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(82, 7) + threadIdx_x // 7 < 576: - if T.Mul(82, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(82, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(82, 7) + threadIdx_x // 7) % 9 and (T.Mul(82, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(82, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - threadIdx_x_1 = T.env_thread("threadIdx.x") - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(0, 49) + threadIdx_x_1 < 1536: - blockIdx_x = T.int32() - kernel_shared[T.Mul(0, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(0, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(0, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - blockIdx_x = T.int32() - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(1, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(1, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(1, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(1, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(2, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(2, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(2, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(2, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(3, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(3, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + 
(T.Mul(3, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(3, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(4, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(4, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(4, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(4, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(5, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(5, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(5, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(5, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(6, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(6, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(6, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(6, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(7, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(7, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(7, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(7, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(8, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(8, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(8, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(8, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(9, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(9, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(9, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(9, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(10, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(10, 49) + 
threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(10, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(10, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(11, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(11, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(11, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(11, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(12, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(12, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(12, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(12, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(13, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(13, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(13, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(13, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(14, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(14, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(14, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(14, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(15, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(15, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(15, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(15, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(16, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(16, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(16, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(16, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(17, 
49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(17, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(17, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(17, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(18, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(18, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(18, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(18, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(19, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(19, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(19, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(19, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(20, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(20, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(20, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(20, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(21, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(21, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(21, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(21, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(22, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(22, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(22, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(22, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(23, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(23, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(23, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(23, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] 
- with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(24, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(24, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(24, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(24, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(25, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(25, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(25, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(25, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(26, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(26, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(26, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(26, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(27, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(27, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(27, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(27, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(28, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(28, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(28, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(28, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(29, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(29, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(29, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(29, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(30, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(30, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(30, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + 
(T.Mul(30, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(31, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(31, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(31, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(31, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - for rc_outer_inner in range(8): - conv2d_nchw = T.Buffer((8,), scope="local", align=32) - threadIdx_x_2 = T.int32() - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + 
T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * 
kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + 
rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - 
conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) 
+ 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 
63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * 
kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - 
conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * 
kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - 
conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + 
rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * 
kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - 
conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + 
rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = 
conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) 
+ threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] 
- conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - threadIdx_x = T.env_thread("threadIdx.x") - pad_temp_shared = T.Buffer((4032,), scope="shared") - data = T.Buffer((25088,)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(0, 7) + threadIdx_x // 7 < 576: - if T.Mul(0, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(0, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(0, 7) + threadIdx_x // 7) % 9 and (T.Mul(0, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(1, 7) + threadIdx_x // 7 < 576: - if T.Mul(1, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(1, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(1, 7) + threadIdx_x // 7) % 9 and (T.Mul(1, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(2, 7) + threadIdx_x // 7 < 576: - if T.Mul(2, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(2, 49) + 
threadIdx_x] = T.if_then_else(1 <= (T.Mul(2, 7) + threadIdx_x // 7) % 9 and (T.Mul(2, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(3, 7) + threadIdx_x // 7 < 576: - if T.Mul(3, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(3, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(3, 7) + threadIdx_x // 7) % 9 and (T.Mul(3, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(4, 7) + threadIdx_x // 7 < 576: - if T.Mul(4, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(4, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(4, 7) + threadIdx_x // 7) % 9 and (T.Mul(4, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(5, 7) + threadIdx_x // 7 < 576: - if T.Mul(5, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(5, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(5, 7) + threadIdx_x // 7) % 9 and (T.Mul(5, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 
49): - if T.Mul(6, 7) + threadIdx_x // 7 < 576: - if T.Mul(6, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(6, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(6, 7) + threadIdx_x // 7) % 9 and (T.Mul(6, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(6, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(7, 7) + threadIdx_x // 7 < 576: - if T.Mul(7, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(7, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(7, 7) + threadIdx_x // 7) % 9 and (T.Mul(7, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(8, 7) + threadIdx_x // 7 < 576: - if T.Mul(8, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(8, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(8, 7) + threadIdx_x // 7) % 9 and (T.Mul(8, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(9, 7) + threadIdx_x // 7 < 576: - if T.Mul(9, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(9, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(9, 7) + threadIdx_x // 7) % 9 and (T.Mul(9, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(9, 7) 
+ threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(10, 7) + threadIdx_x // 7 < 576: - if T.Mul(10, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(10, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(10, 7) + threadIdx_x // 7) % 9 and (T.Mul(10, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(11, 7) + threadIdx_x // 7 < 576: - if T.Mul(11, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(11, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(11, 7) + threadIdx_x // 7) % 9 and (T.Mul(11, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(12, 7) + threadIdx_x // 7 < 576: - if T.Mul(12, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(12, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(12, 7) + threadIdx_x // 7) % 9 and (T.Mul(12, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(13, 7) + threadIdx_x // 7 < 576: - if T.Mul(13, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(13, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(13, 7) + threadIdx_x // 7) % 9 and (T.Mul(13, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer 
+ threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(13, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(14, 7) + threadIdx_x // 7 < 576: - if T.Mul(14, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(14, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(14, 7) + threadIdx_x // 7) % 9 and (T.Mul(14, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(15, 7) + threadIdx_x // 7 < 576: - if T.Mul(15, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(15, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(15, 7) + threadIdx_x // 7) % 9 and (T.Mul(15, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(16, 7) + threadIdx_x // 7 < 576: - if T.Mul(16, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(16, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(16, 7) + threadIdx_x // 7) % 9 and (T.Mul(16, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(16, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(17, 7) + threadIdx_x // 7 < 576: - if T.Mul(17, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(17, 
49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(17, 7) + threadIdx_x // 7) % 9 and (T.Mul(17, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(18, 7) + threadIdx_x // 7 < 576: - if T.Mul(18, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(18, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(18, 7) + threadIdx_x // 7) % 9 and (T.Mul(18, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(19, 7) + threadIdx_x // 7 < 576: - if T.Mul(19, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(19, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(19, 7) + threadIdx_x // 7) % 9 and (T.Mul(19, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(20, 7) + threadIdx_x // 7 < 576: - if T.Mul(20, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(20, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(20, 7) + threadIdx_x // 7) % 9 and (T.Mul(20, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - 
with T.launch_thread(threadIdx_x, 49): - if T.Mul(21, 7) + threadIdx_x // 7 < 576: - if T.Mul(21, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(21, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(21, 7) + threadIdx_x // 7) % 9 and (T.Mul(21, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(22, 7) + threadIdx_x // 7 < 576: - if T.Mul(22, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(22, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(22, 7) + threadIdx_x // 7) % 9 and (T.Mul(22, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(23, 7) + threadIdx_x // 7 < 576: - if T.Mul(23, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(23, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(23, 7) + threadIdx_x // 7) % 9 and (T.Mul(23, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(24, 7) + threadIdx_x // 7 < 576: - if T.Mul(24, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(24, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(24, 7) + threadIdx_x // 7) % 9 and (T.Mul(24, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 
+ (T.Mul(24, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(25, 7) + threadIdx_x // 7 < 576: - if T.Mul(25, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(25, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(25, 7) + threadIdx_x // 7) % 9 and (T.Mul(25, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(25, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(26, 7) + threadIdx_x // 7 < 576: - if T.Mul(26, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(26, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(26, 7) + threadIdx_x // 7) % 9 and (T.Mul(26, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(27, 7) + threadIdx_x // 7 < 576: - if T.Mul(27, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(27, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(27, 7) + threadIdx_x // 7) % 9 and (T.Mul(27, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(28, 7) + threadIdx_x // 7 < 576: - if T.Mul(28, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(28, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(28, 7) + threadIdx_x // 7) % 9 and 
(T.Mul(28, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(29, 7) + threadIdx_x // 7 < 576: - if T.Mul(29, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(29, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(29, 7) + threadIdx_x // 7) % 9 and (T.Mul(29, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(30, 7) + threadIdx_x // 7 < 576: - if T.Mul(30, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(30, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(30, 7) + threadIdx_x // 7) % 9 and (T.Mul(30, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(31, 7) + threadIdx_x // 7 < 576: - if T.Mul(31, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(31, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(31, 7) + threadIdx_x // 7) % 9 and (T.Mul(31, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(31, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(32, 7) + threadIdx_x // 7 < 576: - 
if T.Mul(32, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(32, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(32, 7) + threadIdx_x // 7) % 9 and (T.Mul(32, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(33, 7) + threadIdx_x // 7 < 576: - if T.Mul(33, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(33, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(33, 7) + threadIdx_x // 7) % 9 and (T.Mul(33, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(34, 7) + threadIdx_x // 7 < 576: - if T.Mul(34, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(34, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(34, 7) + threadIdx_x // 7) % 9 and (T.Mul(34, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(35, 7) + threadIdx_x // 7 < 576: - if T.Mul(35, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(35, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(35, 7) + threadIdx_x // 7) % 9 and (T.Mul(35, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x // 7) % 9 
* 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(36, 7) + threadIdx_x // 7 < 576: - if T.Mul(36, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(36, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(36, 7) + threadIdx_x // 7) % 9 and (T.Mul(36, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(37, 7) + threadIdx_x // 7 < 576: - if T.Mul(37, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(37, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(37, 7) + threadIdx_x // 7) % 9 and (T.Mul(37, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(37, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(38, 7) + threadIdx_x // 7 < 576: - if T.Mul(38, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(38, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(38, 7) + threadIdx_x // 7) % 9 and (T.Mul(38, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(39, 7) + threadIdx_x // 7 < 576: - if T.Mul(39, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(39, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(39, 7) + threadIdx_x // 7) % 9 and (T.Mul(39, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and 
rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(40, 7) + threadIdx_x // 7 < 576: - if T.Mul(40, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(40, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(40, 7) + threadIdx_x // 7) % 9 and (T.Mul(40, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(41, 7) + threadIdx_x // 7 < 576: - if T.Mul(41, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(41, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(41, 7) + threadIdx_x // 7) % 9 and (T.Mul(41, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(42, 7) + threadIdx_x // 7 < 576: - if T.Mul(42, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(42, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(42, 7) + threadIdx_x // 7) % 9 and (T.Mul(42, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(43, 7) + threadIdx_x // 7 < 576: - if T.Mul(43, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(43, 49) + threadIdx_x] = 
T.if_then_else(1 <= (T.Mul(43, 7) + threadIdx_x // 7) % 9 and (T.Mul(43, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(44, 7) + threadIdx_x // 7 < 576: - if T.Mul(44, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(44, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(44, 7) + threadIdx_x // 7) % 9 and (T.Mul(44, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(45, 7) + threadIdx_x // 7 < 576: - if T.Mul(45, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(45, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(45, 7) + threadIdx_x // 7) % 9 and (T.Mul(45, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(46, 7) + threadIdx_x // 7 < 576: - if T.Mul(46, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(46, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(46, 7) + threadIdx_x // 7) % 9 and (T.Mul(46, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with 
T.launch_thread(threadIdx_x, 49): - if T.Mul(47, 7) + threadIdx_x // 7 < 576: - if T.Mul(47, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(47, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(47, 7) + threadIdx_x // 7) % 9 and (T.Mul(47, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(48, 7) + threadIdx_x // 7 < 576: - if T.Mul(48, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(48, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(48, 7) + threadIdx_x // 7) % 9 and (T.Mul(48, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(49, 7) + threadIdx_x // 7 < 576: - if T.Mul(49, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(49, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(49, 7) + threadIdx_x // 7) % 9 and (T.Mul(49, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(50, 7) + threadIdx_x // 7 < 576: - if T.Mul(50, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(50, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(50, 7) + threadIdx_x // 7) % 9 and (T.Mul(50, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + 
(T.Mul(50, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(51, 7) + threadIdx_x // 7 < 576: - if T.Mul(51, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(51, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(51, 7) + threadIdx_x // 7) % 9 and (T.Mul(51, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(51, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(52, 7) + threadIdx_x // 7 < 576: - if T.Mul(52, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(52, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(52, 7) + threadIdx_x // 7) % 9 and (T.Mul(52, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(53, 7) + threadIdx_x // 7 < 576: - if T.Mul(53, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(53, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(53, 7) + threadIdx_x // 7) % 9 and (T.Mul(53, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(54, 7) + threadIdx_x // 7 < 576: - if T.Mul(54, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(54, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(54, 7) + threadIdx_x // 7) % 9 and 
(T.Mul(54, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(55, 7) + threadIdx_x // 7 < 576: - if T.Mul(55, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(55, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(55, 7) + threadIdx_x // 7) % 9 and (T.Mul(55, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(56, 7) + threadIdx_x // 7 < 576: - if T.Mul(56, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(56, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(56, 7) + threadIdx_x // 7) % 9 and (T.Mul(56, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(57, 7) + threadIdx_x // 7 < 576: - if T.Mul(57, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(57, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(57, 7) + threadIdx_x // 7) % 9 and (T.Mul(57, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(58, 7) + threadIdx_x // 7 < 576: - 
if T.Mul(58, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(58, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(58, 7) + threadIdx_x // 7) % 9 and (T.Mul(58, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(59, 7) + threadIdx_x // 7 < 576: - if T.Mul(59, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(59, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(59, 7) + threadIdx_x // 7) % 9 and (T.Mul(59, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(60, 7) + threadIdx_x // 7 < 576: - if T.Mul(60, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(60, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(60, 7) + threadIdx_x // 7) % 9 and (T.Mul(60, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(61, 7) + threadIdx_x // 7 < 576: - if T.Mul(61, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(61, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(61, 7) + threadIdx_x // 7) % 9 and (T.Mul(61, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x // 7) % 9 
* 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(62, 7) + threadIdx_x // 7 < 576: - if T.Mul(62, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(62, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(62, 7) + threadIdx_x // 7) % 9 and (T.Mul(62, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(63, 7) + threadIdx_x // 7 < 576: - if T.Mul(63, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(63, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(63, 7) + threadIdx_x // 7) % 9 and (T.Mul(63, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(64, 7) + threadIdx_x // 7 < 576: - if T.Mul(64, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(64, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(64, 7) + threadIdx_x // 7) % 9 and (T.Mul(64, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(65, 7) + threadIdx_x // 7 < 576: - if T.Mul(65, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(65, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(65, 7) + threadIdx_x // 7) % 9 and (T.Mul(65, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and 
rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(66, 7) + threadIdx_x // 7 < 576: - if T.Mul(66, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(66, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(66, 7) + threadIdx_x // 7) % 9 and (T.Mul(66, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(67, 7) + threadIdx_x // 7 < 576: - if T.Mul(67, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(67, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(67, 7) + threadIdx_x // 7) % 9 and (T.Mul(67, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(68, 7) + threadIdx_x // 7 < 576: - if T.Mul(68, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(68, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(68, 7) + threadIdx_x // 7) % 9 and (T.Mul(68, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(69, 7) + threadIdx_x // 7 < 576: - if T.Mul(69, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(69, 49) + threadIdx_x] = 
T.if_then_else(1 <= (T.Mul(69, 7) + threadIdx_x // 7) % 9 and (T.Mul(69, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(70, 7) + threadIdx_x // 7 < 576: - if T.Mul(70, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(70, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(70, 7) + threadIdx_x // 7) % 9 and (T.Mul(70, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(71, 7) + threadIdx_x // 7 < 576: - if T.Mul(71, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(71, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(71, 7) + threadIdx_x // 7) % 9 and (T.Mul(71, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(72, 7) + threadIdx_x // 7 < 576: - if T.Mul(72, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(72, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(72, 7) + threadIdx_x // 7) % 9 and (T.Mul(72, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with 
T.launch_thread(threadIdx_x, 49): - if T.Mul(73, 7) + threadIdx_x // 7 < 576: - if T.Mul(73, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(73, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(73, 7) + threadIdx_x // 7) % 9 and (T.Mul(73, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(74, 7) + threadIdx_x // 7 < 576: - if T.Mul(74, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(74, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(74, 7) + threadIdx_x // 7) % 9 and (T.Mul(74, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(75, 7) + threadIdx_x // 7 < 576: - if T.Mul(75, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(75, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(75, 7) + threadIdx_x // 7) % 9 and (T.Mul(75, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(76, 7) + threadIdx_x // 7 < 576: - if T.Mul(76, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(76, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(76, 7) + threadIdx_x // 7) % 9 and (T.Mul(76, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + 
(T.Mul(76, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(77, 7) + threadIdx_x // 7 < 576: - if T.Mul(77, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(77, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(77, 7) + threadIdx_x // 7) % 9 and (T.Mul(77, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(77, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(78, 7) + threadIdx_x // 7 < 576: - if T.Mul(78, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(78, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(78, 7) + threadIdx_x // 7) % 9 and (T.Mul(78, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(79, 7) + threadIdx_x // 7 < 576: - if T.Mul(79, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(79, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(79, 7) + threadIdx_x // 7) % 9 and (T.Mul(79, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(80, 7) + threadIdx_x // 7 < 576: - if T.Mul(80, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(80, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(80, 7) + threadIdx_x // 7) % 9 and 
(T.Mul(80, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(81, 7) + threadIdx_x // 7 < 576: - if T.Mul(81, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(81, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(81, 7) + threadIdx_x // 7) % 9 and (T.Mul(81, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(82, 7) + threadIdx_x // 7 < 576: - if T.Mul(82, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(82, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(82, 7) + threadIdx_x // 7) % 9 and (T.Mul(82, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(82, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - threadIdx_x_1 = T.env_thread("threadIdx.x") - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(0, 49) + threadIdx_x_1 < 1536: - blockIdx_x = T.int32() - kernel_shared[T.Mul(0, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(0, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(0, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - blockIdx_x = T.int32() - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(1, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(1, 49) + threadIdx_x_1] = kernel[blockIdx_x * 
36864 + (T.Mul(1, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(1, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(2, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(2, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(2, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(2, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(3, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(3, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(3, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(3, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(4, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(4, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(4, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(4, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(5, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(5, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(5, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(5, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(6, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(6, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(6, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(6, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(7, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(7, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(7, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(7, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(8, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(8, 49) + 
threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(8, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(8, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(9, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(9, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(9, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(9, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(10, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(10, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(10, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(10, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(11, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(11, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(11, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(11, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(12, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(12, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(12, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(12, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(13, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(13, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(13, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(13, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(14, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(14, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(14, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(14, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(15, 49) + 
threadIdx_x_1 < 1536: - kernel_shared[T.Mul(15, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(15, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(15, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(16, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(16, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(16, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(16, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(17, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(17, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(17, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(17, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(18, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(18, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(18, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(18, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(19, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(19, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(19, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(19, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(20, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(20, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(20, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(20, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(21, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(21, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(21, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(21, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - 
with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(22, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(22, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(22, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(22, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(23, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(23, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(23, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(23, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(24, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(24, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(24, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(24, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(25, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(25, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(25, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(25, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(26, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(26, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(26, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(26, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(27, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(27, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(27, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(27, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(28, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(28, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(28, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + 
(T.Mul(28, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(29, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(29, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(29, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(29, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(30, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(30, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(30, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(30, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(31, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(31, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(31, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(31, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - for rc_outer_inner in range(8): - conv2d_nchw = T.Buffer((8,), scope="local", align=32) - threadIdx_x_2 = T.int32() - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + 
rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - 
conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = 
conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + 
T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * 
kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + 
rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - 
conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * 
kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - 
conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + 
rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * 
kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - 
conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * 
kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - 
conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + 
rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * 
kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - -0 - -8 - -i1_inner - -conv2d_nchw[i1_inner] - -8 - -blockIdx_x * 8 - -blockIdx_x * 8 + i1_inner - -bias[blockIdx_x * 8 + i1_inner] - -conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner] - -T.float32(0.0) - -T.max(conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) - -392 - -blockIdx_x * 392 - -49 - -i1_inner * 49 - -blockIdx_x * 392 + i1_inner * 49 - -blockIdx_x * 392 + i1_inner * 49 + threadIdx_x - -compute = T.Buffer((25088,)) -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -i1_inner = T.int32() -bias = T.Buffer((512,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + 
i1_inner], T.float32(0.0)) - -for i1_inner in range(8): - compute = T.Buffer((25088,)) - conv2d_nchw = T.Buffer((8,), scope="local", align=32) - bias = T.Buffer((512,)) - blockIdx_x = T.int32() - threadIdx_x = T.int32() - compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = T.float32(0.0) -conv2d_nchw[T.Mul(0, 4) + 1] = T.float32(0.0) -conv2d_nchw[T.Mul(0, 4) + 2] = T.float32(0.0) -conv2d_nchw[T.Mul(0, 4) + 3] = T.float32(0.0) -conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = T.float32(0.0) -conv2d_nchw[T.Mul(1, 4) + 1] = T.float32(0.0) -conv2d_nchw[T.Mul(1, 4) + 2] = T.float32(0.0) -conv2d_nchw[T.Mul(1, 4) + 3] = T.float32(0.0) -blockIdx_x = T.int32() -threadIdx_x_2 = T.int32() -for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - threadIdx_x = T.env_thread("threadIdx.x") - pad_temp_shared = T.Buffer((4032,), scope="shared") - data = T.Buffer((25088,)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(0, 7) + threadIdx_x // 7 < 576: - if T.Mul(0, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(0, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(0, 7) + threadIdx_x // 7) % 9 and (T.Mul(0, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(1, 7) + threadIdx_x // 7 < 576: - if T.Mul(1, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(1, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(1, 7) + threadIdx_x // 7) % 9 and (T.Mul(1, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x // 7) // 9 * 49 + 
(T.Mul(1, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(2, 7) + threadIdx_x // 7 < 576: - if T.Mul(2, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(2, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(2, 7) + threadIdx_x // 7) % 9 and (T.Mul(2, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(3, 7) + threadIdx_x // 7 < 576: - if T.Mul(3, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(3, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(3, 7) + threadIdx_x // 7) % 9 and (T.Mul(3, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(4, 7) + threadIdx_x // 7 < 576: - if T.Mul(4, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(4, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(4, 7) + threadIdx_x // 7) % 9 and (T.Mul(4, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(5, 7) + threadIdx_x // 7 < 576: - if T.Mul(5, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(5, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(5, 7) + threadIdx_x // 7) % 9 and (T.Mul(5, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x 
% 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(6, 7) + threadIdx_x // 7 < 576: - if T.Mul(6, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(6, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(6, 7) + threadIdx_x // 7) % 9 and (T.Mul(6, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(6, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(7, 7) + threadIdx_x // 7 < 576: - if T.Mul(7, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(7, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(7, 7) + threadIdx_x // 7) % 9 and (T.Mul(7, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(8, 7) + threadIdx_x // 7 < 576: - if T.Mul(8, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(8, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(8, 7) + threadIdx_x // 7) % 9 and (T.Mul(8, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(9, 7) + threadIdx_x // 7 < 576: - if T.Mul(9, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(9, 49) + threadIdx_x] = T.if_then_else(1 <= 
(T.Mul(9, 7) + threadIdx_x // 7) % 9 and (T.Mul(9, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(10, 7) + threadIdx_x // 7 < 576: - if T.Mul(10, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(10, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(10, 7) + threadIdx_x // 7) % 9 and (T.Mul(10, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(11, 7) + threadIdx_x // 7 < 576: - if T.Mul(11, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(11, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(11, 7) + threadIdx_x // 7) % 9 and (T.Mul(11, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(12, 7) + threadIdx_x // 7 < 576: - if T.Mul(12, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(12, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(12, 7) + threadIdx_x // 7) % 9 and (T.Mul(12, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if 
T.Mul(13, 7) + threadIdx_x // 7 < 576: - if T.Mul(13, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(13, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(13, 7) + threadIdx_x // 7) % 9 and (T.Mul(13, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(13, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(14, 7) + threadIdx_x // 7 < 576: - if T.Mul(14, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(14, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(14, 7) + threadIdx_x // 7) % 9 and (T.Mul(14, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(15, 7) + threadIdx_x // 7 < 576: - if T.Mul(15, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(15, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(15, 7) + threadIdx_x // 7) % 9 and (T.Mul(15, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(16, 7) + threadIdx_x // 7 < 576: - if T.Mul(16, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(16, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(16, 7) + threadIdx_x // 7) % 9 and (T.Mul(16, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x // 7) // 9 * 
49 + (T.Mul(16, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(17, 7) + threadIdx_x // 7 < 576: - if T.Mul(17, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(17, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(17, 7) + threadIdx_x // 7) % 9 and (T.Mul(17, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(18, 7) + threadIdx_x // 7 < 576: - if T.Mul(18, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(18, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(18, 7) + threadIdx_x // 7) % 9 and (T.Mul(18, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(19, 7) + threadIdx_x // 7 < 576: - if T.Mul(19, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(19, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(19, 7) + threadIdx_x // 7) % 9 and (T.Mul(19, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(20, 7) + threadIdx_x // 7 < 576: - if T.Mul(20, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(20, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(20, 7) + threadIdx_x // 7) % 9 and (T.Mul(20, 7) + threadIdx_x // 7) % 9 < 8 and 1 
<= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(21, 7) + threadIdx_x // 7 < 576: - if T.Mul(21, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(21, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(21, 7) + threadIdx_x // 7) % 9 and (T.Mul(21, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(22, 7) + threadIdx_x // 7 < 576: - if T.Mul(22, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(22, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(22, 7) + threadIdx_x // 7) % 9 and (T.Mul(22, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(23, 7) + threadIdx_x // 7 < 576: - if T.Mul(23, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(23, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(23, 7) + threadIdx_x // 7) % 9 and (T.Mul(23, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(24, 7) + threadIdx_x // 7 < 576: - if T.Mul(24, 49) + threadIdx_x < 4032: - 
pad_temp_shared[T.Mul(24, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(24, 7) + threadIdx_x // 7) % 9 and (T.Mul(24, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(25, 7) + threadIdx_x // 7 < 576: - if T.Mul(25, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(25, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(25, 7) + threadIdx_x // 7) % 9 and (T.Mul(25, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(25, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(26, 7) + threadIdx_x // 7 < 576: - if T.Mul(26, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(26, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(26, 7) + threadIdx_x // 7) % 9 and (T.Mul(26, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(27, 7) + threadIdx_x // 7 < 576: - if T.Mul(27, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(27, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(27, 7) + threadIdx_x // 7) % 9 and (T.Mul(27, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 
8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(28, 7) + threadIdx_x // 7 < 576: - if T.Mul(28, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(28, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(28, 7) + threadIdx_x // 7) % 9 and (T.Mul(28, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(29, 7) + threadIdx_x // 7 < 576: - if T.Mul(29, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(29, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(29, 7) + threadIdx_x // 7) % 9 and (T.Mul(29, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(30, 7) + threadIdx_x // 7 < 576: - if T.Mul(30, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(30, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(30, 7) + threadIdx_x // 7) % 9 and (T.Mul(30, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(31, 7) + threadIdx_x // 7 < 576: - if T.Mul(31, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(31, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(31, 7) + threadIdx_x // 7) % 9 and (T.Mul(31, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, 
data[rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(31, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(32, 7) + threadIdx_x // 7 < 576: - if T.Mul(32, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(32, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(32, 7) + threadIdx_x // 7) % 9 and (T.Mul(32, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(33, 7) + threadIdx_x // 7 < 576: - if T.Mul(33, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(33, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(33, 7) + threadIdx_x // 7) % 9 and (T.Mul(33, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(34, 7) + threadIdx_x // 7 < 576: - if T.Mul(34, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(34, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(34, 7) + threadIdx_x // 7) % 9 and (T.Mul(34, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(35, 7) + threadIdx_x // 7 < 576: - if T.Mul(35, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(35, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(35, 7) + 
threadIdx_x // 7) % 9 and (T.Mul(35, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(36, 7) + threadIdx_x // 7 < 576: - if T.Mul(36, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(36, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(36, 7) + threadIdx_x // 7) % 9 and (T.Mul(36, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(37, 7) + threadIdx_x // 7 < 576: - if T.Mul(37, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(37, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(37, 7) + threadIdx_x // 7) % 9 and (T.Mul(37, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(37, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(38, 7) + threadIdx_x // 7 < 576: - if T.Mul(38, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(38, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(38, 7) + threadIdx_x // 7) % 9 and (T.Mul(38, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(39, 7) + 
threadIdx_x // 7 < 576: - if T.Mul(39, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(39, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(39, 7) + threadIdx_x // 7) % 9 and (T.Mul(39, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(40, 7) + threadIdx_x // 7 < 576: - if T.Mul(40, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(40, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(40, 7) + threadIdx_x // 7) % 9 and (T.Mul(40, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(41, 7) + threadIdx_x // 7 < 576: - if T.Mul(41, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(41, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(41, 7) + threadIdx_x // 7) % 9 and (T.Mul(41, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(42, 7) + threadIdx_x // 7 < 576: - if T.Mul(42, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(42, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(42, 7) + threadIdx_x // 7) % 9 and (T.Mul(42, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(42, 
7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(43, 7) + threadIdx_x // 7 < 576: - if T.Mul(43, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(43, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(43, 7) + threadIdx_x // 7) % 9 and (T.Mul(43, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(44, 7) + threadIdx_x // 7 < 576: - if T.Mul(44, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(44, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(44, 7) + threadIdx_x // 7) % 9 and (T.Mul(44, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(45, 7) + threadIdx_x // 7 < 576: - if T.Mul(45, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(45, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(45, 7) + threadIdx_x // 7) % 9 and (T.Mul(45, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(46, 7) + threadIdx_x // 7 < 576: - if T.Mul(46, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(46, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(46, 7) + threadIdx_x // 7) % 9 and (T.Mul(46, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= 
rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(47, 7) + threadIdx_x // 7 < 576: - if T.Mul(47, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(47, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(47, 7) + threadIdx_x // 7) % 9 and (T.Mul(47, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(48, 7) + threadIdx_x // 7 < 576: - if T.Mul(48, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(48, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(48, 7) + threadIdx_x // 7) % 9 and (T.Mul(48, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(49, 7) + threadIdx_x // 7 < 576: - if T.Mul(49, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(49, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(49, 7) + threadIdx_x // 7) % 9 and (T.Mul(49, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(50, 7) + threadIdx_x // 7 < 576: - if T.Mul(50, 49) + threadIdx_x < 4032: - 
pad_temp_shared[T.Mul(50, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(50, 7) + threadIdx_x // 7) % 9 and (T.Mul(50, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(51, 7) + threadIdx_x // 7 < 576: - if T.Mul(51, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(51, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(51, 7) + threadIdx_x // 7) % 9 and (T.Mul(51, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(51, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(52, 7) + threadIdx_x // 7 < 576: - if T.Mul(52, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(52, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(52, 7) + threadIdx_x // 7) % 9 and (T.Mul(52, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(53, 7) + threadIdx_x // 7 < 576: - if T.Mul(53, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(53, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(53, 7) + threadIdx_x // 7) % 9 and (T.Mul(53, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 
8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(54, 7) + threadIdx_x // 7 < 576: - if T.Mul(54, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(54, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(54, 7) + threadIdx_x // 7) % 9 and (T.Mul(54, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(55, 7) + threadIdx_x // 7 < 576: - if T.Mul(55, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(55, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(55, 7) + threadIdx_x // 7) % 9 and (T.Mul(55, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(56, 7) + threadIdx_x // 7 < 576: - if T.Mul(56, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(56, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(56, 7) + threadIdx_x // 7) % 9 and (T.Mul(56, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(57, 7) + threadIdx_x // 7 < 576: - if T.Mul(57, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(57, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(57, 7) + threadIdx_x // 7) % 9 and (T.Mul(57, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, 
data[rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(58, 7) + threadIdx_x // 7 < 576: - if T.Mul(58, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(58, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(58, 7) + threadIdx_x // 7) % 9 and (T.Mul(58, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(59, 7) + threadIdx_x // 7 < 576: - if T.Mul(59, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(59, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(59, 7) + threadIdx_x // 7) % 9 and (T.Mul(59, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(60, 7) + threadIdx_x // 7 < 576: - if T.Mul(60, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(60, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(60, 7) + threadIdx_x // 7) % 9 and (T.Mul(60, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(61, 7) + threadIdx_x // 7 < 576: - if T.Mul(61, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(61, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(61, 7) + 
threadIdx_x // 7) % 9 and (T.Mul(61, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(62, 7) + threadIdx_x // 7 < 576: - if T.Mul(62, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(62, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(62, 7) + threadIdx_x // 7) % 9 and (T.Mul(62, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(63, 7) + threadIdx_x // 7 < 576: - if T.Mul(63, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(63, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(63, 7) + threadIdx_x // 7) % 9 and (T.Mul(63, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(64, 7) + threadIdx_x // 7 < 576: - if T.Mul(64, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(64, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(64, 7) + threadIdx_x // 7) % 9 and (T.Mul(64, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(65, 7) + 
threadIdx_x // 7 < 576: - if T.Mul(65, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(65, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(65, 7) + threadIdx_x // 7) % 9 and (T.Mul(65, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(66, 7) + threadIdx_x // 7 < 576: - if T.Mul(66, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(66, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(66, 7) + threadIdx_x // 7) % 9 and (T.Mul(66, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(67, 7) + threadIdx_x // 7 < 576: - if T.Mul(67, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(67, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(67, 7) + threadIdx_x // 7) % 9 and (T.Mul(67, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(68, 7) + threadIdx_x // 7 < 576: - if T.Mul(68, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(68, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(68, 7) + threadIdx_x // 7) % 9 and (T.Mul(68, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(68, 
7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(69, 7) + threadIdx_x // 7 < 576: - if T.Mul(69, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(69, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(69, 7) + threadIdx_x // 7) % 9 and (T.Mul(69, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(70, 7) + threadIdx_x // 7 < 576: - if T.Mul(70, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(70, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(70, 7) + threadIdx_x // 7) % 9 and (T.Mul(70, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(71, 7) + threadIdx_x // 7 < 576: - if T.Mul(71, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(71, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(71, 7) + threadIdx_x // 7) % 9 and (T.Mul(71, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(72, 7) + threadIdx_x // 7 < 576: - if T.Mul(72, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(72, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(72, 7) + threadIdx_x // 7) % 9 and (T.Mul(72, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= 
rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(73, 7) + threadIdx_x // 7 < 576: - if T.Mul(73, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(73, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(73, 7) + threadIdx_x // 7) % 9 and (T.Mul(73, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(74, 7) + threadIdx_x // 7 < 576: - if T.Mul(74, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(74, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(74, 7) + threadIdx_x // 7) % 9 and (T.Mul(74, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(75, 7) + threadIdx_x // 7 < 576: - if T.Mul(75, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(75, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(75, 7) + threadIdx_x // 7) % 9 and (T.Mul(75, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(76, 7) + threadIdx_x // 7 < 576: - if T.Mul(76, 49) + threadIdx_x < 4032: - 
pad_temp_shared[T.Mul(76, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(76, 7) + threadIdx_x // 7) % 9 and (T.Mul(76, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(77, 7) + threadIdx_x // 7 < 576: - if T.Mul(77, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(77, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(77, 7) + threadIdx_x // 7) % 9 and (T.Mul(77, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(77, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(78, 7) + threadIdx_x // 7 < 576: - if T.Mul(78, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(78, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(78, 7) + threadIdx_x // 7) % 9 and (T.Mul(78, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(79, 7) + threadIdx_x // 7 < 576: - if T.Mul(79, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(79, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(79, 7) + threadIdx_x // 7) % 9 and (T.Mul(79, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 
8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(80, 7) + threadIdx_x // 7 < 576: - if T.Mul(80, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(80, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(80, 7) + threadIdx_x // 7) % 9 and (T.Mul(80, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(81, 7) + threadIdx_x // 7 < 576: - if T.Mul(81, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(81, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(81, 7) + threadIdx_x // 7) % 9 and (T.Mul(81, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if T.Mul(82, 7) + threadIdx_x // 7 < 576: - if T.Mul(82, 49) + threadIdx_x < 4032: - pad_temp_shared[T.Mul(82, 49) + threadIdx_x] = T.if_then_else(1 <= (T.Mul(82, 7) + threadIdx_x // 7) % 9 and (T.Mul(82, 7) + threadIdx_x // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(82, 7) + threadIdx_x // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x // 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - threadIdx_x_1 = T.env_thread("threadIdx.x") - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(0, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(0, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(0, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + 
(T.Mul(0, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(1, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(1, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(1, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(1, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(2, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(2, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(2, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(2, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(3, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(3, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(3, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(3, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(4, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(4, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(4, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(4, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(5, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(5, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(5, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(5, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(6, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(6, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(6, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(6, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(7, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(7, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(7, 49) + threadIdx_x_1) // 192 * 
4608 + rc_outer_outer * 576 + (T.Mul(7, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(8, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(8, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(8, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(8, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(9, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(9, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(9, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(9, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(10, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(10, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(10, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(10, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(11, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(11, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(11, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(11, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(12, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(12, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(12, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(12, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(13, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(13, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(13, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(13, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(14, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(14, 49) + threadIdx_x_1] = kernel[blockIdx_x * 
36864 + (T.Mul(14, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(14, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(15, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(15, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(15, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(15, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(16, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(16, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(16, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(16, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(17, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(17, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(17, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(17, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(18, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(18, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(18, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(18, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(19, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(19, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(19, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(19, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(20, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(20, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(20, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(20, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(21, 49) + threadIdx_x_1 < 1536: - 
kernel_shared[T.Mul(21, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(21, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(21, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(22, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(22, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(22, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(22, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(23, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(23, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(23, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(23, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(24, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(24, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(24, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(24, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(25, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(25, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(25, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(25, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(26, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(26, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(26, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(26, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(27, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(27, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(27, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(27, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with 
T.launch_thread(threadIdx_x_1, 49): - if T.Mul(28, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(28, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(28, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(28, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(29, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(29, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(29, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(29, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(30, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(30, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(30, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(30, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(31, 49) + threadIdx_x_1 < 1536: - kernel_shared[T.Mul(31, 49) + threadIdx_x_1] = kernel[blockIdx_x * 36864 + (T.Mul(31, 49) + threadIdx_x_1) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(31, 49) + threadIdx_x_1) % 192 * 3 + rx_outer_outer] - for rc_outer_inner in range(8): - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + 
T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * 
kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + 
rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - 
conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = 
conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + 
T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x_2] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + 
rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * 
kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - 
conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + 
rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x_2] * 
kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - 
conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * 
kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - 
conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + 
rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x_2] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] -for i1_inner in range(8): - compute = T.Buffer((25088,)) - bias = T.Buffer((512,)) - compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x_2] = T.max(conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - conv2d_nchw = T.Buffer((8,), scope="local", align=32) - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = T.float32(0.0) - conv2d_nchw[T.Mul(0, 4) + 1] = T.float32(0.0) - conv2d_nchw[T.Mul(0, 4) + 2] = T.float32(0.0) - conv2d_nchw[T.Mul(0, 4) + 3] = T.float32(0.0) - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = T.float32(0.0) - conv2d_nchw[T.Mul(1, 4) + 1] = T.float32(0.0) - 
conv2d_nchw[T.Mul(1, 4) + 2] = T.float32(0.0) - conv2d_nchw[T.Mul(1, 4) + 3] = T.float32(0.0) - blockIdx_x = T.int32() - for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - threadIdx_x_1 = T.env_thread("threadIdx.x") - pad_temp_shared = T.Buffer((4032,), scope="shared") - data = T.Buffer((25088,)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(0, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(0, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(0, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(0, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(0, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(1, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(1, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(1, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(1, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(1, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(2, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(2, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(2, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(2, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(2, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): 
- if T.Mul(3, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(3, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(3, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(3, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(3, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(4, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(4, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(4, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(4, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(4, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(5, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(5, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(5, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(5, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(5, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(6, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(6, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(6, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(6, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(6, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, 
data[rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(6, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(7, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(7, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(7, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(7, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(7, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(8, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(8, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(8, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(8, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(8, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(9, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(9, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(9, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(9, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(9, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(10, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(10, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(10, 49) + 
threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(10, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(10, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(11, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(11, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(11, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(11, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(11, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(12, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(12, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(12, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(12, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(12, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(13, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(13, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(13, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(13, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(13, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(13, 7) + threadIdx_x_1 // 
7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(14, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(14, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(14, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(14, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(14, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(15, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(15, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(15, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(15, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(15, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(16, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(16, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(16, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(16, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(16, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(16, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(17, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(17, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(17, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(17, 7) + threadIdx_x_1 // 7) % 9 and 
(T.Mul(17, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(18, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(18, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(18, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(18, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(18, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(19, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(19, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(19, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(19, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(19, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(20, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(20, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(20, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(20, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(20, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with 
T.launch_thread(threadIdx_x_1, 49): - if T.Mul(21, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(21, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(21, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(21, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(21, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(22, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(22, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(22, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(22, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(22, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(23, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(23, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(23, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(23, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(23, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(24, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(24, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(24, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(24, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(24, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + 
threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(25, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(25, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(25, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(25, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(25, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(25, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(26, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(26, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(26, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(26, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(26, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(27, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(27, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(27, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(27, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(27, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(28, 7) + threadIdx_x_1 // 
7 < 576: - if T.Mul(28, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(28, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(28, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(28, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(29, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(29, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(29, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(29, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(29, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(30, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(30, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(30, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(30, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(30, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(31, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(31, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(31, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(31, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(31, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 
3136 + (T.Mul(31, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(31, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(32, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(32, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(32, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(32, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(32, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(33, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(33, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(33, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(33, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(33, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(34, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(34, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(34, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(34, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(34, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(35, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(35, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(35, 49) 
+ threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(35, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(35, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(36, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(36, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(36, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(36, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(36, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(37, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(37, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(37, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(37, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(37, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(37, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(38, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(38, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(38, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(38, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(38, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x_1 
// 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(39, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(39, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(39, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(39, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(39, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(40, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(40, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(40, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(40, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(40, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(41, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(41, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(41, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(41, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(41, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(42, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(42, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(42, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(42, 7) + threadIdx_x_1 // 7) % 9 and 
(T.Mul(42, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(43, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(43, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(43, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(43, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(43, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(44, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(44, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(44, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(44, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(44, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(45, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(45, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(45, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(45, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(45, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with 
T.launch_thread(threadIdx_x_1, 49): - if T.Mul(46, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(46, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(46, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(46, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(46, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(47, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(47, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(47, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(47, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(47, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(48, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(48, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(48, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(48, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(48, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(49, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(49, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(49, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(49, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(49, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + 
threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(50, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(50, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(50, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(50, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(50, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(51, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(51, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(51, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(51, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(51, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(51, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(52, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(52, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(52, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(52, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(52, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(53, 7) + threadIdx_x_1 // 
7 < 576: - if T.Mul(53, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(53, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(53, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(53, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(54, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(54, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(54, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(54, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(54, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(55, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(55, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(55, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(55, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(55, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(56, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(56, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(56, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(56, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(56, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 
3136 + (T.Mul(56, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(57, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(57, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(57, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(57, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(57, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(58, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(58, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(58, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(58, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(58, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(59, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(59, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(59, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(59, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(59, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(60, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(60, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(60, 49) 
+ threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(60, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(60, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(61, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(61, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(61, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(61, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(61, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(62, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(62, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(62, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(62, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(62, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(63, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(63, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(63, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(63, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(63, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x_1 
// 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(64, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(64, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(64, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(64, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(64, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(65, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(65, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(65, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(65, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(65, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(66, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(66, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(66, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(66, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(66, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(67, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(67, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(67, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(67, 7) + threadIdx_x_1 // 7) % 9 and 
(T.Mul(67, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(68, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(68, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(68, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(68, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(68, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(69, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(69, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(69, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(69, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(69, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(70, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(70, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(70, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(70, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(70, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with 
T.launch_thread(threadIdx_x_1, 49): - if T.Mul(71, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(71, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(71, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(71, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(71, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(72, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(72, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(72, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(72, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(72, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(73, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(73, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(73, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(73, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(73, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(74, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(74, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(74, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(74, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(74, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + 
threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(75, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(75, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(75, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(75, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(75, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(76, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(76, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(76, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(76, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(76, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(77, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(77, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(77, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(77, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(77, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(77, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(78, 7) + threadIdx_x_1 // 
7 < 576: - if T.Mul(78, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(78, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(78, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(78, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(79, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(79, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(79, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(79, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(79, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(80, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(80, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(80, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(80, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(80, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(81, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(81, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(81, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(81, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(81, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 
3136 + (T.Mul(81, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(82, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(82, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(82, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(82, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(82, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(82, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - threadIdx_x_2 = T.env_thread("threadIdx.x") - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(0, 49) + threadIdx_x_2 < 1536: - kernel_shared[T.Mul(0, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(0, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(0, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(1, 49) + threadIdx_x_2 < 1536: - kernel_shared[T.Mul(1, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(1, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(1, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(2, 49) + threadIdx_x_2 < 1536: - kernel_shared[T.Mul(2, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(2, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(2, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(3, 49) + threadIdx_x_2 < 1536: - kernel_shared[T.Mul(3, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(3, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(3, 49) + threadIdx_x_2) % 192 * 3 + 
rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(4, 49) + threadIdx_x_2 < 1536: - kernel_shared[T.Mul(4, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(4, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(4, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(5, 49) + threadIdx_x_2 < 1536: - kernel_shared[T.Mul(5, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(5, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(5, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(6, 49) + threadIdx_x_2 < 1536: - kernel_shared[T.Mul(6, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(6, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(6, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(7, 49) + threadIdx_x_2 < 1536: - kernel_shared[T.Mul(7, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(7, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(7, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(8, 49) + threadIdx_x_2 < 1536: - kernel_shared[T.Mul(8, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(8, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(8, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(9, 49) + threadIdx_x_2 < 1536: - kernel_shared[T.Mul(9, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(9, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(9, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(10, 49) + threadIdx_x_2 < 1536: - kernel_shared[T.Mul(10, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(10, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + 
(T.Mul(10, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(11, 49) + threadIdx_x_2 < 1536: - kernel_shared[T.Mul(11, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(11, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(11, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(12, 49) + threadIdx_x_2 < 1536: - kernel_shared[T.Mul(12, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(12, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(12, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(13, 49) + threadIdx_x_2 < 1536: - kernel_shared[T.Mul(13, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(13, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(13, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(14, 49) + threadIdx_x_2 < 1536: - kernel_shared[T.Mul(14, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(14, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(14, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(15, 49) + threadIdx_x_2 < 1536: - kernel_shared[T.Mul(15, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(15, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(15, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(16, 49) + threadIdx_x_2 < 1536: - kernel_shared[T.Mul(16, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(16, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(16, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(17, 49) + threadIdx_x_2 < 1536: - kernel_shared[T.Mul(17, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(17, 
49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(17, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(18, 49) + threadIdx_x_2 < 1536: - kernel_shared[T.Mul(18, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(18, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(18, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(19, 49) + threadIdx_x_2 < 1536: - kernel_shared[T.Mul(19, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(19, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(19, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(20, 49) + threadIdx_x_2 < 1536: - kernel_shared[T.Mul(20, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(20, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(20, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(21, 49) + threadIdx_x_2 < 1536: - kernel_shared[T.Mul(21, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(21, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(21, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(22, 49) + threadIdx_x_2 < 1536: - kernel_shared[T.Mul(22, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(22, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(22, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(23, 49) + threadIdx_x_2 < 1536: - kernel_shared[T.Mul(23, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(23, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(23, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(24, 49) + threadIdx_x_2 < 1536: - kernel_shared[T.Mul(24, 49) 
+ threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(24, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(24, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(25, 49) + threadIdx_x_2 < 1536: - kernel_shared[T.Mul(25, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(25, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(25, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(26, 49) + threadIdx_x_2 < 1536: - kernel_shared[T.Mul(26, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(26, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(26, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(27, 49) + threadIdx_x_2 < 1536: - kernel_shared[T.Mul(27, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(27, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(27, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(28, 49) + threadIdx_x_2 < 1536: - kernel_shared[T.Mul(28, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(28, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(28, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(29, 49) + threadIdx_x_2 < 1536: - kernel_shared[T.Mul(29, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(29, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(29, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(30, 49) + threadIdx_x_2 < 1536: - kernel_shared[T.Mul(30, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(30, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(30, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(31, 
49) + threadIdx_x_2 < 1536: - kernel_shared[T.Mul(31, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(31, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(31, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - for rc_outer_inner in range(8): - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = 
conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * 
kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 
+ T.Mul(7, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = 
conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 
504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * 
kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + 
T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + 
T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + 
T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + 
T.Mul(4, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + 
T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + 
T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + 
T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + 
T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - for i1_inner in range(8): - compute = T.Buffer((25088,)) - 
bias = T.Buffer((512,)) - compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) - -T.bool(True) - -with T.allocate([1536], "float32", "shared") as kernel_shared: - threadIdx_x = T.launch_thread("threadIdx.x", 49) - conv2d_nchw = T.Buffer((8,), scope="local", align=32) - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = T.float32(0.0) - conv2d_nchw[T.Mul(0, 4) + 1] = T.float32(0.0) - conv2d_nchw[T.Mul(0, 4) + 2] = T.float32(0.0) - conv2d_nchw[T.Mul(0, 4) + 3] = T.float32(0.0) - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = T.float32(0.0) - conv2d_nchw[T.Mul(1, 4) + 1] = T.float32(0.0) - conv2d_nchw[T.Mul(1, 4) + 2] = T.float32(0.0) - conv2d_nchw[T.Mul(1, 4) + 3] = T.float32(0.0) - blockIdx_x = T.int32() - for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - threadIdx_x_1 = T.env_thread("threadIdx.x") - pad_temp_shared = T.Buffer((4032,), scope="shared") - data = T.Buffer((25088,)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(0, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(0, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(0, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(0, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(0, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(1, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(1, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(1, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(1, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(1, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x_1 // 7) % 9 * 7 
+ rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(2, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(2, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(2, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(2, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(2, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(3, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(3, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(3, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(3, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(3, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(4, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(4, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(4, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(4, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(4, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(5, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(5, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(5, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(5, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(5, 7) + threadIdx_x_1 // 7) % 9 < 
8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(6, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(6, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(6, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(6, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(6, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(6, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(7, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(7, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(7, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(7, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(7, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(8, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(8, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(8, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(8, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(8, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(9, 7) + threadIdx_x_1 
// 7 < 576: - if T.Mul(9, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(9, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(9, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(9, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(10, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(10, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(10, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(10, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(10, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(11, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(11, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(11, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(11, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(11, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(12, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(12, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(12, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(12, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(12, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 
+ (T.Mul(12, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(13, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(13, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(13, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(13, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(13, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(13, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(14, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(14, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(14, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(14, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(14, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(15, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(15, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(15, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(15, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(15, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(16, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(16, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(16, 49) + 
threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(16, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(16, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(16, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(17, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(17, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(17, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(17, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(17, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(18, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(18, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(18, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(18, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(18, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(19, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(19, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(19, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(19, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(19, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x_1 // 
7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(20, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(20, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(20, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(20, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(20, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(21, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(21, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(21, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(21, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(21, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(22, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(22, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(22, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(22, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(22, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(23, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(23, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(23, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(23, 7) + threadIdx_x_1 // 7) % 9 and 
(T.Mul(23, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(24, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(24, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(24, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(24, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(24, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(25, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(25, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(25, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(25, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(25, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(25, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(26, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(26, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(26, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(26, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(26, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with 
T.launch_thread(threadIdx_x_1, 49): - if T.Mul(27, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(27, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(27, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(27, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(27, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(28, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(28, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(28, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(28, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(28, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(29, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(29, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(29, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(29, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(29, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(30, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(30, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(30, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(30, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(30, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + 
threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(31, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(31, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(31, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(31, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(31, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(31, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(32, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(32, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(32, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(32, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(32, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(33, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(33, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(33, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(33, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(33, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(34, 7) + threadIdx_x_1 // 
7 < 576: - if T.Mul(34, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(34, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(34, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(34, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(35, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(35, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(35, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(35, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(35, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(36, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(36, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(36, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(36, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(36, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(37, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(37, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(37, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(37, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(37, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 
3136 + (T.Mul(37, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(38, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(38, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(38, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(38, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(38, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(39, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(39, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(39, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(39, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(39, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(40, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(40, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(40, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(40, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(40, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(41, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(41, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(41, 49) 
+ threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(41, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(41, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(42, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(42, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(42, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(42, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(42, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(43, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(43, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(43, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(43, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(43, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(44, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(44, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(44, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(44, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(44, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x_1 
// 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(45, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(45, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(45, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(45, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(45, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(46, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(46, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(46, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(46, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(46, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(47, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(47, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(47, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(47, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(47, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(48, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(48, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(48, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(48, 7) + threadIdx_x_1 // 7) % 9 and 
(T.Mul(48, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(49, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(49, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(49, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(49, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(49, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(50, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(50, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(50, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(50, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(50, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(51, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(51, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(51, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(51, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(51, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(51, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with 
T.launch_thread(threadIdx_x_1, 49): - if T.Mul(52, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(52, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(52, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(52, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(52, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(53, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(53, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(53, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(53, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(53, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(54, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(54, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(54, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(54, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(54, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(55, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(55, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(55, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(55, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(55, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + 
threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(56, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(56, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(56, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(56, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(56, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(57, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(57, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(57, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(57, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(57, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(58, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(58, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(58, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(58, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(58, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(59, 7) + threadIdx_x_1 // 
7 < 576: - if T.Mul(59, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(59, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(59, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(59, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(60, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(60, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(60, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(60, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(60, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(61, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(61, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(61, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(61, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(61, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(62, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(62, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(62, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(62, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(62, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 
3136 + (T.Mul(62, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(63, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(63, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(63, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(63, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(63, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(64, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(64, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(64, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(64, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(64, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(65, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(65, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(65, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(65, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(65, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(66, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(66, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(66, 49) 
+ threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(66, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(66, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(67, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(67, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(67, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(67, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(67, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(68, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(68, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(68, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(68, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(68, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(69, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(69, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(69, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(69, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(69, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x_1 
// 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(70, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(70, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(70, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(70, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(70, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(71, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(71, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(71, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(71, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(71, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(72, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(72, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(72, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(72, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(72, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(73, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(73, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(73, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(73, 7) + threadIdx_x_1 // 7) % 9 and 
(T.Mul(73, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(74, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(74, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(74, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(74, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(74, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(75, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(75, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(75, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(75, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(75, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(76, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(76, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(76, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(76, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(76, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with 
T.launch_thread(threadIdx_x_1, 49): - if T.Mul(77, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(77, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(77, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(77, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(77, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(77, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(78, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(78, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(78, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(78, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(78, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(79, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(79, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(79, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(79, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(79, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(80, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(80, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(80, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(80, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(80, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + 
threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(81, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(81, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(81, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(81, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(81, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(82, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(82, 49) + threadIdx_x_1 < 4032: - pad_temp_shared[T.Mul(82, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(82, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(82, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(82, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - threadIdx_x_2 = T.env_thread("threadIdx.x") - kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") - kernel = T.Buffer((2359296,)) - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(0, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(0, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(0, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(0, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(1, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(1, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(1, 49) + threadIdx_x_2) 
// 192 * 4608 + rc_outer_outer * 576 + (T.Mul(1, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(2, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(2, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(2, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(2, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(3, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(3, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(3, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(3, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(4, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(4, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(4, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(4, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(5, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(5, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(5, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(5, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(6, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(6, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(6, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(6, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(7, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(7, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(7, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(7, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(8, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(8, 49) + threadIdx_x_2] = 
kernel[blockIdx_x * 36864 + (T.Mul(8, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(8, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(9, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(9, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(9, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(9, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(10, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(10, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(10, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(10, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(11, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(11, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(11, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(11, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(12, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(12, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(12, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(12, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(13, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(13, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(13, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(13, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(14, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(14, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(14, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(14, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(15, 49) + 
threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(15, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(15, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(15, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(16, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(16, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(16, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(16, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(17, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(17, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(17, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(17, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(18, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(18, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(18, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(18, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(19, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(19, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(19, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(19, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(20, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(20, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(20, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(20, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(21, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(21, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(21, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(21, 49) + threadIdx_x_2) % 192 * 3 + 
rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(22, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(22, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(22, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(22, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(23, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(23, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(23, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(23, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(24, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(24, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(24, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(24, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(25, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(25, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(25, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(25, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(26, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(26, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(26, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(26, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(27, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(27, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(27, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(27, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(28, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(28, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(28, 49) + threadIdx_x_2) // 192 * 
4608 + rc_outer_outer * 576 + (T.Mul(28, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(29, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(29, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(29, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(29, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(30, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(30, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(30, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(30, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(31, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(31, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(31, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(31, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - for rc_outer_inner in range(8): - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - 
conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = 
conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + 
T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * 
kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + 
rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - 
conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) 
+ 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + 
threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - 
conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + 
rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * 
kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - 
conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * 
kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - 
conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + 
T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + 
rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + 
pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * 
kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - 
conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - for i1_inner in range(8): - compute = T.Buffer((25088,)) - bias = T.Buffer((512,)) - compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) - -T.bool(True) - -with T.allocate([4032], "float32", "shared") as pad_temp_shared: - kernel_shared = T.allocate([1536], "float32", "shared") - threadIdx_x = T.launch_thread("threadIdx.x", 49) - conv2d_nchw = T.Buffer((8,), scope="local", align=32) - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = T.float32(0.0) - conv2d_nchw[T.Mul(0, 4) + 1] = T.float32(0.0) - conv2d_nchw[T.Mul(0, 4) + 2] = T.float32(0.0) - conv2d_nchw[T.Mul(0, 4) + 3] = T.float32(0.0) - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = T.float32(0.0) - conv2d_nchw[T.Mul(1, 4) + 1] = T.float32(0.0) - conv2d_nchw[T.Mul(1, 4) + 2] = 
T.float32(0.0) - conv2d_nchw[T.Mul(1, 4) + 3] = T.float32(0.0) - blockIdx_x = T.int32() - for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - threadIdx_x_1 = T.env_thread("threadIdx.x") - pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") - data = T.Buffer((25088,)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(0, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(0, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(0, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(0, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(0, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(1, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(1, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(1, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(1, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(1, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(2, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(2, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(2, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(2, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(2, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): 
- if T.Mul(3, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(3, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(3, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(3, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(3, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(4, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(4, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(4, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(4, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(4, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(5, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(5, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(5, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(5, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(5, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(6, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(6, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(6, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(6, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(6, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, 
data[rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(6, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(7, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(7, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(7, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(7, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(7, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(8, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(8, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(8, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(8, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(8, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(9, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(9, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(9, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(9, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(9, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(10, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(10, 49) + threadIdx_x_1 < 4032: - 
pad_temp_shared_1[T.Mul(10, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(10, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(10, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(11, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(11, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(11, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(11, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(11, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(12, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(12, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(12, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(12, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(12, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(13, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(13, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(13, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(13, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(13, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x_1 // 7) // 9 
* 49 + (T.Mul(13, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(14, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(14, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(14, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(14, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(14, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(15, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(15, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(15, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(15, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(15, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(16, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(16, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(16, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(16, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(16, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(16, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(17, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(17, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(17, 49) + threadIdx_x_1] = T.if_then_else(1 <= 
(T.Mul(17, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(17, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(18, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(18, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(18, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(18, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(18, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(19, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(19, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(19, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(19, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(19, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(20, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(20, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(20, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(20, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(20, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + 
threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(21, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(21, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(21, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(21, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(21, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(22, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(22, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(22, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(22, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(22, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(23, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(23, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(23, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(23, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(23, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(24, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(24, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(24, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(24, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(24, 7) + 
threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(25, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(25, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(25, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(25, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(25, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(25, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(26, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(26, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(26, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(26, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(26, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(27, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(27, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(27, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(27, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(27, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with 
T.launch_thread(threadIdx_x_1, 49): - if T.Mul(28, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(28, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(28, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(28, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(28, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(29, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(29, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(29, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(29, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(29, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(30, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(30, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(30, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(30, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(30, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(31, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(31, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(31, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(31, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(31, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + 
threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(31, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(32, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(32, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(32, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(32, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(32, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(33, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(33, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(33, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(33, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(33, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(34, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(34, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(34, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(34, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(34, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(35, 7) + 
threadIdx_x_1 // 7 < 576: - if T.Mul(35, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(35, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(35, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(35, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(36, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(36, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(36, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(36, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(36, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(37, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(37, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(37, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(37, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(37, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(37, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(38, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(38, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(38, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(38, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(38, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 
8, data[rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(39, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(39, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(39, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(39, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(39, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(40, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(40, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(40, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(40, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(40, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(41, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(41, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(41, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(41, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(41, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(42, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(42, 49) + threadIdx_x_1 < 4032: - 
pad_temp_shared_1[T.Mul(42, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(42, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(42, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(43, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(43, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(43, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(43, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(43, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(44, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(44, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(44, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(44, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(44, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(45, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(45, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(45, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(45, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(45, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x_1 // 7) // 9 
* 49 + (T.Mul(45, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(46, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(46, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(46, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(46, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(46, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(47, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(47, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(47, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(47, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(47, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(48, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(48, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(48, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(48, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(48, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(49, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(49, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(49, 49) + threadIdx_x_1] = T.if_then_else(1 <= 
(T.Mul(49, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(49, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(50, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(50, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(50, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(50, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(50, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(51, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(51, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(51, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(51, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(51, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(51, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(52, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(52, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(52, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(52, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(52, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + 
threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(53, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(53, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(53, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(53, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(53, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(54, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(54, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(54, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(54, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(54, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(55, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(55, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(55, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(55, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(55, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(56, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(56, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(56, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(56, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(56, 7) + 
threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(57, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(57, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(57, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(57, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(57, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(58, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(58, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(58, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(58, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(58, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(59, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(59, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(59, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(59, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(59, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with 
T.launch_thread(threadIdx_x_1, 49): - if T.Mul(60, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(60, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(60, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(60, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(60, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(61, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(61, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(61, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(61, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(61, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(62, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(62, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(62, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(62, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(62, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(63, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(63, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(63, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(63, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(63, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + 
threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(64, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(64, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(64, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(64, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(64, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(65, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(65, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(65, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(65, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(65, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(66, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(66, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(66, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(66, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(66, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(67, 7) + 
threadIdx_x_1 // 7 < 576: - if T.Mul(67, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(67, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(67, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(67, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(68, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(68, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(68, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(68, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(68, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(69, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(69, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(69, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(69, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(69, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(70, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(70, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(70, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(70, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(70, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 
8, data[rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(71, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(71, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(71, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(71, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(71, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(72, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(72, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(72, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(72, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(72, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(73, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(73, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(73, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(73, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(73, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(74, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(74, 49) + threadIdx_x_1 < 4032: - 
pad_temp_shared_1[T.Mul(74, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(74, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(74, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(75, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(75, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(75, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(75, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(75, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(76, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(76, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(76, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(76, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(76, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(77, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(77, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(77, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(77, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(77, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x_1 // 7) // 9 
* 49 + (T.Mul(77, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(78, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(78, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(78, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(78, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(78, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(79, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(79, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(79, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(79, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(79, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(80, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(80, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(80, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(80, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(80, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(81, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(81, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(81, 49) + threadIdx_x_1] = T.if_then_else(1 <= 
(T.Mul(81, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(81, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(82, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(82, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(82, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(82, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(82, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(82, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - threadIdx_x_2 = T.env_thread("threadIdx.x") - kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") - kernel = T.Buffer((2359296,)) - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(0, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(0, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(0, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(0, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(1, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(1, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(1, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(1, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(2, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(2, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(2, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(2, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if 
T.Mul(3, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(3, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(3, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(3, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(4, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(4, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(4, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(4, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(5, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(5, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(5, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(5, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(6, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(6, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(6, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(6, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(7, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(7, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(7, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(7, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(8, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(8, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(8, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(8, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(9, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(9, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(9, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(9, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - 
with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(10, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(10, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(10, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(10, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(11, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(11, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(11, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(11, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(12, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(12, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(12, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(12, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(13, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(13, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(13, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(13, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(14, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(14, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(14, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(14, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(15, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(15, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(15, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(15, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(16, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(16, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(16, 49) + threadIdx_x_2) // 192 * 4608 + 
rc_outer_outer * 576 + (T.Mul(16, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(17, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(17, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(17, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(17, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(18, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(18, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(18, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(18, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(19, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(19, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(19, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(19, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(20, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(20, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(20, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(20, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(21, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(21, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(21, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(21, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(22, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(22, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(22, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(22, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(23, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(23, 49) + threadIdx_x_2] = 
kernel[blockIdx_x * 36864 + (T.Mul(23, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(23, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(24, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(24, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(24, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(24, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(25, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(25, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(25, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(25, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(26, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(26, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(26, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(26, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(27, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(27, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(27, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(27, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(28, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(28, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(28, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(28, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(29, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(29, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(29, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(29, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(30, 49) + 
threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(30, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(30, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(30, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(31, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(31, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(31, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(31, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - for rc_outer_inner in range(8): - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + 
rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] 
- conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = 
conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + 
pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 
+ T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + 
threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * 
kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + 
T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(0, 4) 
+ 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 
63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + 
rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = 
conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 
63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + 
rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = 
conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + 
T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + 
rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = 
conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + 
T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 1] = conv2d_nchw[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 2] = conv2d_nchw[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(0, 4) + 3] = conv2d_nchw[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + 
rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = 
conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + 
T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + 
rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 1] = conv2d_nchw[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 2] = conv2d_nchw[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw[T.Mul(1, 4) + 3] = conv2d_nchw[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - for i1_inner in range(8): - compute = T.Buffer((25088,)) - bias = T.Buffer((512,)) - compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) - -T.bool(True) - -with T.allocate([8], "float32", "local") as conv2d_nchw: - pad_temp_shared = T.allocate([4032], "float32", "shared") - kernel_shared = T.allocate([1536], "float32", "shared") - threadIdx_x = T.launch_thread("threadIdx.x", 49) - conv2d_nchw_1 = T.Buffer((8,), data=conv2d_nchw, scope="local", align=32) - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = T.float32(0.0) - conv2d_nchw_1[T.Mul(0, 4) + 1] = T.float32(0.0) - conv2d_nchw_1[T.Mul(0, 4) + 2] = T.float32(0.0) - conv2d_nchw_1[T.Mul(0, 4) + 3] = T.float32(0.0) - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = T.float32(0.0) - conv2d_nchw_1[T.Mul(1, 4) + 1] = T.float32(0.0) - conv2d_nchw_1[T.Mul(1, 4) + 2] = T.float32(0.0) - conv2d_nchw_1[T.Mul(1, 4) + 3] = T.float32(0.0) - blockIdx_x = T.int32() - for rc_outer_outer, 
rx_outer_outer in T.grid(8, 3): - threadIdx_x_1 = T.env_thread("threadIdx.x") - pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") - data = T.Buffer((25088,)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(0, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(0, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(0, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(0, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(0, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(1, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(1, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(1, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(1, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(1, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(2, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(2, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(2, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(2, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(2, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(3, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(3, 49) + threadIdx_x_1 < 4032: - 
pad_temp_shared_1[T.Mul(3, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(3, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(3, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(4, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(4, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(4, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(4, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(4, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(5, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(5, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(5, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(5, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(5, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(6, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(6, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(6, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(6, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(6, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(6, 7) + 
threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(7, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(7, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(7, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(7, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(7, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(8, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(8, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(8, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(8, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(8, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(9, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(9, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(9, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(9, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(9, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(10, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(10, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(10, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(10, 7) + threadIdx_x_1 // 7) % 9 and 
(T.Mul(10, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(11, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(11, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(11, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(11, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(11, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(12, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(12, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(12, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(12, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(12, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(12, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(13, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(13, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(13, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(13, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(13, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(13, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - 
with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(14, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(14, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(14, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(14, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(14, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(15, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(15, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(15, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(15, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(15, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(16, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(16, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(16, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(16, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(16, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(16, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(17, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(17, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(17, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(17, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(17, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + 
threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(18, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(18, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(18, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(18, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(18, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(19, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(19, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(19, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(19, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(19, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(19, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(20, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(20, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(20, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(20, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(20, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(21, 7) + 
threadIdx_x_1 // 7 < 576: - if T.Mul(21, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(21, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(21, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(21, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(22, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(22, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(22, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(22, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(22, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(23, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(23, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(23, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(23, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(23, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(24, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(24, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(24, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(24, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(24, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 
8, data[rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(25, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(25, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(25, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(25, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(25, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(25, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(26, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(26, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(26, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(26, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(26, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(27, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(27, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(27, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(27, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(27, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(28, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(28, 49) + threadIdx_x_1 < 4032: - 
pad_temp_shared_1[T.Mul(28, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(28, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(28, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(29, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(29, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(29, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(29, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(29, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(30, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(30, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(30, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(30, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(30, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(31, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(31, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(31, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(31, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(31, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x_1 // 7) // 9 
* 49 + (T.Mul(31, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(32, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(32, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(32, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(32, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(32, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(33, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(33, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(33, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(33, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(33, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(34, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(34, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(34, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(34, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(34, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(35, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(35, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(35, 49) + threadIdx_x_1] = T.if_then_else(1 <= 
(T.Mul(35, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(35, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(36, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(36, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(36, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(36, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(36, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(37, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(37, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(37, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(37, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(37, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(37, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(38, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(38, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(38, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(38, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(38, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + 
threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(39, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(39, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(39, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(39, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(39, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(40, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(40, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(40, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(40, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(40, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(41, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(41, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(41, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(41, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(41, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(42, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(42, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(42, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(42, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(42, 7) + 
threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(43, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(43, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(43, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(43, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(43, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(44, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(44, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(44, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(44, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(44, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(44, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(45, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(45, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(45, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(45, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(45, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with 
T.launch_thread(threadIdx_x_1, 49): - if T.Mul(46, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(46, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(46, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(46, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(46, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(47, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(47, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(47, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(47, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(47, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(48, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(48, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(48, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(48, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(48, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(49, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(49, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(49, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(49, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(49, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + 
threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(50, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(50, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(50, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(50, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(50, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(51, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(51, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(51, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(51, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(51, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(51, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(52, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(52, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(52, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(52, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(52, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(53, 7) + 
threadIdx_x_1 // 7 < 576: - if T.Mul(53, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(53, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(53, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(53, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(54, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(54, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(54, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(54, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(54, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(55, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(55, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(55, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(55, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(55, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(56, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(56, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(56, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(56, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(56, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 
8, data[rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(57, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(57, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(57, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(57, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(57, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(58, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(58, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(58, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(58, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(58, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(59, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(59, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(59, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(59, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(59, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(60, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(60, 49) + threadIdx_x_1 < 4032: - 
pad_temp_shared_1[T.Mul(60, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(60, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(60, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(61, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(61, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(61, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(61, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(61, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(62, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(62, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(62, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(62, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(62, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(63, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(63, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(63, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(63, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(63, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x_1 // 7) // 9 
* 49 + (T.Mul(63, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(64, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(64, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(64, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(64, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(64, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(65, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(65, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(65, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(65, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(65, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(66, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(66, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(66, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(66, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(66, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(67, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(67, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(67, 49) + threadIdx_x_1] = T.if_then_else(1 <= 
(T.Mul(67, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(67, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(68, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(68, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(68, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(68, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(68, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(69, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(69, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(69, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(69, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(69, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(70, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(70, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(70, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(70, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(70, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + 
threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(71, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(71, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(71, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(71, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(71, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(72, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(72, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(72, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(72, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(72, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(73, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(73, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(73, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(73, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(73, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(74, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(74, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(74, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(74, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(74, 7) + 
threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(75, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(75, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(75, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(75, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(75, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(76, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(76, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(76, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(76, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(76, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(76, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(77, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(77, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(77, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(77, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(77, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(77, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with 
T.launch_thread(threadIdx_x_1, 49): - if T.Mul(78, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(78, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(78, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(78, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(78, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(79, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(79, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(79, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(79, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(79, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(80, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(80, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(80, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(80, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(80, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(81, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(81, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(81, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(81, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(81, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + 
threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(82, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(82, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(82, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(82, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(82, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(82, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - threadIdx_x_2 = T.env_thread("threadIdx.x") - kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") - kernel = T.Buffer((2359296,)) - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(0, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(0, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(0, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(0, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(1, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(1, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(1, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(1, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(2, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(2, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(2, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(2, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(3, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(3, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + 
(T.Mul(3, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(3, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(4, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(4, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(4, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(4, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(5, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(5, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(5, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(5, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(6, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(6, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(6, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(6, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(7, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(7, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(7, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(7, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(8, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(8, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(8, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(8, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(9, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(9, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(9, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(9, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(10, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(10, 49) 
+ threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(10, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(10, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(11, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(11, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(11, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(11, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(12, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(12, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(12, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(12, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(13, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(13, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(13, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(13, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(14, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(14, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(14, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(14, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(15, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(15, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(15, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(15, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(16, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(16, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(16, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(16, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - 
if T.Mul(17, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(17, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(17, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(17, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(18, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(18, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(18, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(18, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(19, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(19, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(19, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(19, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(20, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(20, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(20, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(20, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(21, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(21, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(21, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(21, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(22, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(22, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(22, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(22, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(23, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(23, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(23, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(23, 49) + threadIdx_x_2) 
% 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(24, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(24, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(24, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(24, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(25, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(25, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(25, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(25, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(26, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(26, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(26, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(26, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(27, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(27, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(27, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(27, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(28, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(28, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(28, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(28, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(29, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(29, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(29, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(29, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(30, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(30, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(30, 49) + 
threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(30, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(31, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(31, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(31, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(31, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - for rc_outer_inner in range(8): - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 2] = 
conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 2] = 
conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 2] = 
conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 2] = 
conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 2] = 
conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 2] = 
conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 2] = 
conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 2] = 
conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + 
pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + 
T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 
768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 
3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 3] = 
conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + 
pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + 
T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 
768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 
3) + 2] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = 
conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + 
pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + 
T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 
768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 
3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 1] = 
conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - for i1_inner in range(8): - compute = T.Buffer((25088,)) - bias = T.Buffer((512,)) - compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw_1[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) - -with T.launch_thread("blockIdx.x", 64) as blockIdx_x: - conv2d_nchw = T.allocate([8], "float32", "local") - 
pad_temp_shared = T.allocate([4032], "float32", "shared") - kernel_shared = T.allocate([1536], "float32", "shared") - threadIdx_x = T.launch_thread("threadIdx.x", 49) - conv2d_nchw_1 = T.Buffer((8,), data=conv2d_nchw, scope="local", align=32) - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = T.float32(0.0) - conv2d_nchw_1[T.Mul(0, 4) + 1] = T.float32(0.0) - conv2d_nchw_1[T.Mul(0, 4) + 2] = T.float32(0.0) - conv2d_nchw_1[T.Mul(0, 4) + 3] = T.float32(0.0) - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = T.float32(0.0) - conv2d_nchw_1[T.Mul(1, 4) + 1] = T.float32(0.0) - conv2d_nchw_1[T.Mul(1, 4) + 2] = T.float32(0.0) - conv2d_nchw_1[T.Mul(1, 4) + 3] = T.float32(0.0) - for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - threadIdx_x_1 = T.env_thread("threadIdx.x") - pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") - data = T.Buffer((25088,)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(0, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(0, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(0, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(0, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(0, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(0, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(0, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(1, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(1, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(1, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(1, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(1, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(1, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(1, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with 
T.launch_thread(threadIdx_x_1, 49): - if T.Mul(2, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(2, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(2, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(2, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(2, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(2, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(2, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(3, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(3, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(3, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(3, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(3, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(3, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(3, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(4, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(4, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(4, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(4, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(4, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(4, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(4, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(5, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(5, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(5, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(5, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(5, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and 
rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(5, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(5, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(6, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(6, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(6, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(6, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(6, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(6, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(6, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(7, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(7, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(7, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(7, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(7, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(7, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(7, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(8, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(8, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(8, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(8, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(8, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(8, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(8, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(9, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(9, 49) + 
threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(9, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(9, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(9, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(9, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(9, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(10, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(10, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(10, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(10, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(10, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(10, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(10, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(11, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(11, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(11, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(11, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(11, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(11, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(11, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(12, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(12, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(12, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(12, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(12, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(12, 7) + 
threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(12, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(13, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(13, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(13, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(13, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(13, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(13, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(13, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(14, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(14, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(14, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(14, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(14, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(14, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(14, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(15, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(15, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(15, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(15, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(15, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(15, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(15, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(16, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(16, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(16, 49) + 
threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(16, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(16, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(16, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(16, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(17, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(17, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(17, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(17, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(17, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(17, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(17, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(18, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(18, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(18, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(18, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(18, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(18, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(18, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(19, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(19, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(19, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(19, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(19, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(19, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(19, 7) + 
threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(20, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(20, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(20, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(20, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(20, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(20, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(20, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(21, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(21, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(21, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(21, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(21, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(21, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(21, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(22, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(22, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(22, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(22, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(22, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(22, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(22, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(23, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(23, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(23, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(23, 7) + 
threadIdx_x_1 // 7) % 9 and (T.Mul(23, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(23, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(23, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(24, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(24, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(24, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(24, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(24, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(24, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(24, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(25, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(25, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(25, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(25, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(25, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(25, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(25, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(26, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(26, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(26, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(26, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(26, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(26, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(26, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 
7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(27, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(27, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(27, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(27, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(27, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(27, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(27, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(28, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(28, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(28, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(28, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(28, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(28, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(28, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(29, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(29, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(29, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(29, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(29, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(29, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(29, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(30, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(30, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(30, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(30, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(30, 7) + threadIdx_x_1 // 7) % 9 < 8 
and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(30, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(30, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(31, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(31, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(31, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(31, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(31, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(31, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(31, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(32, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(32, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(32, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(32, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(32, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(32, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(32, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(33, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(33, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(33, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(33, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(33, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(33, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(33, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if 
T.Mul(34, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(34, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(34, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(34, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(34, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(34, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(34, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(35, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(35, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(35, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(35, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(35, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(35, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(35, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(36, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(36, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(36, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(36, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(36, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(36, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(36, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(37, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(37, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(37, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(37, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(37, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + 
threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(37, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(37, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(38, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(38, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(38, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(38, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(38, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(38, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(38, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(39, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(39, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(39, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(39, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(39, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(39, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(39, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(40, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(40, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(40, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(40, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(40, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(40, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(40, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(41, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(41, 49) + 
threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(41, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(41, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(41, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(41, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(41, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(42, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(42, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(42, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(42, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(42, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(42, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(42, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(43, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(43, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(43, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(43, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(43, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(43, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(43, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(44, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(44, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(44, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(44, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(44, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(44, 7) + 
threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(44, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(45, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(45, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(45, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(45, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(45, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(45, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(45, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(46, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(46, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(46, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(46, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(46, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(46, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(46, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(47, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(47, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(47, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(47, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(47, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(47, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(47, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(48, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(48, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(48, 49) + 
threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(48, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(48, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(48, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(48, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(49, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(49, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(49, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(49, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(49, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(49, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(49, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(50, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(50, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(50, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(50, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(50, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(50, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(50, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(51, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(51, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(51, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(51, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(51, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(51, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(51, 7) + 
threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(52, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(52, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(52, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(52, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(52, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(52, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(52, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(53, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(53, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(53, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(53, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(53, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(53, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(53, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(54, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(54, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(54, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(54, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(54, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(54, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(54, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(55, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(55, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(55, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(55, 7) + 
threadIdx_x_1 // 7) % 9 and (T.Mul(55, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(55, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(55, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(56, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(56, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(56, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(56, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(56, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(56, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(56, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(57, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(57, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(57, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(57, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(57, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(57, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(57, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(58, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(58, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(58, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(58, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(58, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(58, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(58, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 
7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(59, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(59, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(59, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(59, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(59, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(59, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(59, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(60, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(60, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(60, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(60, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(60, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(60, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(60, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(61, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(61, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(61, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(61, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(61, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(61, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(61, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(62, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(62, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(62, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(62, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(62, 7) + threadIdx_x_1 // 7) % 9 < 8 
and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(62, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(62, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(63, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(63, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(63, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(63, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(63, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(63, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(63, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(64, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(64, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(64, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(64, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(64, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(64, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(64, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(65, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(65, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(65, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(65, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(65, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(65, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(65, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if 
T.Mul(66, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(66, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(66, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(66, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(66, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(66, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(66, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(67, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(67, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(67, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(67, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(67, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(67, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(67, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(68, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(68, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(68, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(68, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(68, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(68, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(68, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(69, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(69, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(69, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(69, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(69, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + 
threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(69, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(69, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(70, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(70, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(70, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(70, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(70, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(70, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(70, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(71, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(71, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(71, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(71, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(71, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(71, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(71, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(72, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(72, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(72, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(72, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(72, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(72, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(72, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(73, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(73, 49) + 
threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(73, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(73, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(73, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(73, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(73, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(74, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(74, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(74, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(74, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(74, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(74, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(74, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(75, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(75, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(75, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(75, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(75, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(75, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(75, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(76, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(76, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(76, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(76, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(76, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(76, 7) + 
threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(76, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(77, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(77, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(77, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(77, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(77, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(77, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(77, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(78, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(78, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(78, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(78, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(78, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(78, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(78, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(79, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(79, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(79, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(79, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(79, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(79, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(79, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(80, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(80, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(80, 49) + 
threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(80, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(80, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(80, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(80, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(81, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(81, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(81, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(81, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(81, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(81, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(81, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.Mul(82, 7) + threadIdx_x_1 // 7 < 576: - if T.Mul(82, 49) + threadIdx_x_1 < 4032: - pad_temp_shared_1[T.Mul(82, 49) + threadIdx_x_1] = T.if_then_else(1 <= (T.Mul(82, 7) + threadIdx_x_1 // 7) % 9 and (T.Mul(82, 7) + threadIdx_x_1 // 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (T.Mul(82, 7) + threadIdx_x_1 // 7) // 9 * 49 + (T.Mul(82, 7) + threadIdx_x_1 // 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - threadIdx_x_2 = T.env_thread("threadIdx.x") - kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") - kernel = T.Buffer((2359296,)) - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(0, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(0, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(0, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(0, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 
49): - if T.Mul(1, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(1, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(1, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(1, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(2, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(2, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(2, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(2, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(3, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(3, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(3, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(3, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(4, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(4, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(4, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(4, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(5, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(5, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(5, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(5, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(6, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(6, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(6, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(6, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(7, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(7, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(7, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(7, 49) + threadIdx_x_2) % 192 * 3 + 
rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(8, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(8, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(8, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(8, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(9, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(9, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(9, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(9, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(10, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(10, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(10, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(10, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(11, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(11, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(11, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(11, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(12, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(12, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(12, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(12, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(13, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(13, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(13, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(13, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(14, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(14, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(14, 49) + threadIdx_x_2) // 192 * 4608 + 
rc_outer_outer * 576 + (T.Mul(14, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(15, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(15, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(15, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(15, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(16, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(16, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(16, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(16, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(17, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(17, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(17, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(17, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(18, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(18, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(18, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(18, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(19, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(19, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(19, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(19, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(20, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(20, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(20, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(20, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(21, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(21, 49) + threadIdx_x_2] = 
kernel[blockIdx_x * 36864 + (T.Mul(21, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(21, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(22, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(22, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(22, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(22, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(23, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(23, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(23, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(23, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(24, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(24, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(24, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(24, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(25, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(25, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(25, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(25, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(26, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(26, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(26, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(26, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(27, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(27, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(27, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(27, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(28, 49) + 
threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(28, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(28, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(28, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(29, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(29, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(29, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(29, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(30, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(30, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(30, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(30, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - if T.Mul(31, 49) + threadIdx_x_2 < 1536: - kernel_shared_1[T.Mul(31, 49) + threadIdx_x_2] = kernel[blockIdx_x * 36864 + (T.Mul(31, 49) + threadIdx_x_2) // 192 * 4608 + rc_outer_outer * 576 + (T.Mul(31, 49) + threadIdx_x_2) % 192 * 3 + rx_outer_outer] - for rc_outer_inner in range(8): - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + 
pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + 
pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + 
pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + 
pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + 
pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3), 0)] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3), 0)] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + 
pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3), 0)] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3), 0)] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + 
pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3), 0)] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3), 0)] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + 
pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3), 0)] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(0, 7) + threadIdx_x] * kernel_shared_1[T.Add(T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3), 0)] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + 
pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + 
T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 
768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 
3) + 1] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 1] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = 
conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 1] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 1] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + 
pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 1] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 1] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + 
T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 1] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 1] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 
768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(1, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 1] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 
3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 1] = 
conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + 
pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(0, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 1] = conv2d_nchw_1[T.Mul(0, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + 
T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 2] = conv2d_nchw_1[T.Mul(0, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw_1[T.Mul(0, 4) + 3] = conv2d_nchw_1[T.Mul(0, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(0, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(0, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(0, 3) + 2] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 
768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(1, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(1, 3) + 2] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(2, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(2, 3) + 2] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(3, 
3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(3, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(3, 3) + 2] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(4, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(4, 3) + 2] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 2] = 
conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(5, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(5, 3) + 2] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(6, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(6, 3) + 2] - conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] = conv2d_nchw_1[T.Add(T.Mul(1, 4), 0)] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(0, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 1] = conv2d_nchw_1[T.Mul(1, 4) + 1] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(1, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 2] = conv2d_nchw_1[T.Mul(1, 4) + 2] + 
pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(2, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - conv2d_nchw_1[T.Mul(1, 4) + 3] = conv2d_nchw_1[T.Mul(1, 4) + 3] + pad_temp_shared_1[rc_outer_inner * 504 + T.Mul(7, 63) + T.Mul(2, 7) + threadIdx_x] * kernel_shared_1[T.Mul(1, 768) + T.Mul(3, 192) + rc_outer_inner * 24 + T.Mul(7, 3) + 2] - for i1_inner in range(8): - compute = T.Buffer((25088,)) - bias = T.Buffer((512,)) - compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw_1[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) - -# from tvm.script import ir as I -# from tvm.script import tir as T - -@I.ir_module -class Module: - @T.prim_func - def main(data: T.Buffer((1, 512, 7, 7), "float32"), kernel: T.Buffer((512, 512, 3, 3), "float32"), bias: T.Buffer((1, 512, 1, 1), "float32"), compute: T.Buffer((1, 512, 7, 7), "float32")): - T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)}) - blockIdx_x = T.launch_thread("blockIdx.x", 64) - conv2d_nchw = T.allocate([8], "float32", "local") - pad_temp_shared = T.allocate([4032], "float32", "shared") - kernel_shared = T.allocate([1536], "float32", "shared") - threadIdx_x = T.launch_thread("threadIdx.x", 49) - conv2d_nchw_1 = T.Buffer((8,), data=conv2d_nchw, scope="local", align=32) - conv2d_nchw_1[0] = T.float32(0.0) - conv2d_nchw_1[1] = T.float32(0.0) - conv2d_nchw_1[2] = T.float32(0.0) - conv2d_nchw_1[3] = T.float32(0.0) - conv2d_nchw_1[4] = T.float32(0.0) - conv2d_nchw_1[5] = T.float32(0.0) - conv2d_nchw_1[6] = T.float32(0.0) - conv2d_nchw_1[7] = T.float32(0.0) - for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - cse_var_2: T.int32 = rc_outer_outer * 3136 - cse_var_1: T.int32 = rc_outer_outer * 576 - threadIdx_x_1 = T.env_thread("threadIdx.x") - pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") - data_1 = T.Buffer((25088,), data=data.data) - with 
T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 49] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 49) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 98] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 98) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 147] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 147) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 196] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 196) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 245] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and 
rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 245) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 294] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 294) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 343] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 343) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 392] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 392) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 441] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 335], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 490] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 490) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], 
T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 539] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 539) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 588] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 588) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 637] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 637) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 686] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 686) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 735] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 735) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - 
pad_temp_shared_1[threadIdx_x_1 + 784] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 784) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 833] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 833) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 882] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 678], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 931] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 931) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 980] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 980) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1029] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and 
rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1029) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1078] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1078) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1127] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1127) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1176] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1176) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1225] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1225) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1274] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1274) // 63 * 49 + threadIdx_x_1 + rx_outer_outer 
+ 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1323] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 1021], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1372] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1372) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1421] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1421) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1470] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1470) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1519] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1519) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1568] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 
and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1568) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1617] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1617) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1666] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1666) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1715] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1715) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1764] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 1364], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1813] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1813) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 
7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1862] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1862) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1911] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1911) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1960] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1960) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2009] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2009) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2058] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2058) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with 
T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2107] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2107) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2156] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2156) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2205] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 1707], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2254] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2254) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2303] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2303) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2352] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= 
rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2352) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2401] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2401) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2450] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2450) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2499] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2499) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2548] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2548) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2597] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2597) // 
63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2646] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 2050], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2695] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2695) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2744] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2744) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2793] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2793) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2842] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2842) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2891] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 
9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2891) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2940] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2940) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2989] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2989) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3038] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3038) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3087] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 2393], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3136] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3136) // 63 
* 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3185] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3185) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3234] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3234) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3283] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3283) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3332] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3332) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3381] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3381) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 
% 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3430] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3430) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3479] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3479) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3528] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 2736], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3577] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3577) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3626] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3626) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3675] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and 
(threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3675) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3724] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3724) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3773] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3773) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3822] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3822) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3871] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3871) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3920] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, 
data_1[cse_var_2 + (threadIdx_x_1 + 3920) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3969] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 3079], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if threadIdx_x_1 < 14: - pad_temp_shared_1[threadIdx_x_1 + 4018] = T.if_then_else(threadIdx_x_1 < 7 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x_1 + 41], T.float32(0.0)) - threadIdx_x_2 = T.env_thread("threadIdx.x") - kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") - kernel_1 = T.Buffer((2359296,), data=kernel.data) - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2] = kernel_1[blockIdx_x * 36864 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 49] = kernel_1[blockIdx_x * 36864 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 147] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 98] = kernel_1[blockIdx_x * 36864 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 294] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 147] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 147) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 147) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 196] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 196) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 12] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 245] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 245) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + 
rx_outer_outer + 159] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 294] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 294) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 306] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 343] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 343) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 151) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 392] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 392) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 24] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 441] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 441) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 171] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 490] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 490) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 318] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 539] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 539) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 155) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 588] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 588) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 36] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 637] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 637) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 183] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 686] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 686) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 330] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 735] = kernel_1[blockIdx_x * 
36864 + (threadIdx_x_2 + 735) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 159) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 784] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 784) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 48] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 833] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 833) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 195] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 882] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 882) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 342] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 931] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 931) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 163) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 980] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 980) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 60] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1029] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1029) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 207] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1078] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1078) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 354] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1127] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1127) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 167) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1176] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1176) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 72] - with 
T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1225] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1225) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 219] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1274] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1274) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 366] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1323] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1323) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 171) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1372] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1372) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 84] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1421] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1421) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 231] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1470] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1470) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 378] - with T.launch_thread(threadIdx_x_2, 49): - if threadIdx_x_2 < 17: - kernel_shared_1[threadIdx_x_2 + 1519] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1519) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 525] - for rc_outer_inner in range(8): - cse_var_3: T.int32 = rc_outer_inner * 24 - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 192] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 384] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 576] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 3] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 195] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 387] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 579] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 6] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 198] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 390] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 582] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 9] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 201] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 393] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 585] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 12] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 204] - conv2d_nchw_1[2] = 
conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 396] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 588] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 15] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 207] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 399] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 591] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 18] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 210] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 402] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 594] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 21] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 213] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 405] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 597] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 768] - 
conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 960] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 1152] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 1344] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 771] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 963] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 1155] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 1347] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 774] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 966] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 1158] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 1350] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 777] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 969] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 1161] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * 
kernel_shared_1[cse_var_3 + 1353] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 780] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 972] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 1164] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 1356] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 783] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 975] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 1167] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 1359] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 786] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 978] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 1170] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 1362] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 789] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 981] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 1173] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 1365] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 1] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 193] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 385] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 577] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 4] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 196] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 388] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 580] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 7] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 199] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 391] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 583] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 10] - conv2d_nchw_1[1] = 
conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 202] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 394] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 586] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 13] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 205] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 397] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 589] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 16] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 208] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 400] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 592] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 19] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 211] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 403] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 
595] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 22] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 214] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 406] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 598] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 769] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 961] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 1153] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 1345] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 772] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 964] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 1156] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 1348] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 775] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 967] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * 
kernel_shared_1[cse_var_3 + 1159] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 1351] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 778] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 970] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 1162] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 1354] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 781] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 973] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 1165] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 1357] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 784] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 976] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 1168] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 1360] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 787] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 979] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 1171] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 1363] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 790] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 982] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 1174] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 1366] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 2] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 194] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 386] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 578] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 5] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 197] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 389] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 581] - conv2d_nchw_1[0] = 
conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 8] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 200] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 392] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 584] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 11] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 203] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 395] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 587] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 14] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 206] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 398] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 590] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 17] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 209] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 401] 
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 593] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 20] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 212] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 404] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 596] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 23] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 215] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 407] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 599] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 770] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 962] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 1154] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 1346] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 773] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * 
kernel_shared_1[cse_var_3 + 965] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 1157] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 1349] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 776] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 968] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 1160] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 1352] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 779] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 971] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 1163] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 1355] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 782] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 974] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 1166] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 1358] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 785] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 977] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 1169] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 1361] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 788] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 980] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 1172] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 1364] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 791] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 983] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 1175] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 1367] - for i1_inner in range(8): - compute_1 = T.Buffer((25088,), data=compute.data) - bias_1 = T.Buffer((512,), data=bias.data) - compute_1[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw_1[i1_inner] + bias_1[blockIdx_x * 8 + i1_inner], T.float32(0.0)) -Phase 3 --------------------- -64 - -8 - -4032 - -1536 - -49 - -T.float32(0.0) - -0 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) 
-conv2d_nchw[0] = T.float32(0.0) - -1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -conv2d_nchw[1] = T.float32(0.0) - -2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -conv2d_nchw[2] = T.float32(0.0) - -3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -conv2d_nchw[3] = T.float32(0.0) - -4 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -conv2d_nchw[4] = T.float32(0.0) - -5 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -conv2d_nchw[5] = T.float32(0.0) - -6 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -conv2d_nchw[6] = T.float32(0.0) - -7 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -conv2d_nchw[7] = T.float32(0.0) - -0 - -8 - -0 - -3 - -49 - -7 - -threadIdx_x - -7 <= threadIdx_x - -1 - -rx_outer_outer - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -rc_outer_outer - -3136 - -rc_outer_outer * 3136 - -rc_outer_outer * 3136 + threadIdx_x - -rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer - -8 - -rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer - 8 - -data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer - 8] - -T.float32(0.0) - -T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer - 8], T.float32(0.0)) - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + 
threadIdx_x + rx_outer_outer - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -7 - -threadIdx_x // 7 + 7 - -9 - -(threadIdx_x // 7 + 7) % 9 - -1 <= (threadIdx_x // 7 + 7) % 9 - -7 - -threadIdx_x // 7 - -7 - -threadIdx_x // 7 + 7 - -9 - -(threadIdx_x // 7 + 7) % 9 - -8 - -(threadIdx_x // 7 + 7) % 9 < 8 - -1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -49 - -threadIdx_x + 49 - -63 - -(threadIdx_x + 49) // 63 - -49 - -(threadIdx_x + 49) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 49) // 63 * 49 - -7 - -threadIdx_x // 7 - -7 - -threadIdx_x // 7 + 7 - -9 - -(threadIdx_x // 7 + 7) % 9 - -7 - -(threadIdx_x // 7 + 7) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 49) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 49) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 49) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 49) // 
63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 49) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 49) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -49 - -threadIdx_x + 49 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 49] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 49) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 49] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 49) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -5 - -threadIdx_x // 7 + 5 - -9 - -(threadIdx_x // 7 + 5) % 9 - -1 <= (threadIdx_x // 7 + 5) % 9 - -7 - -threadIdx_x // 7 - -5 - -threadIdx_x // 7 + 5 - -9 - -(threadIdx_x // 7 + 5) % 9 - -8 - -(threadIdx_x // 7 + 5) % 9 < 8 - -1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer 
+ threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -98 - -threadIdx_x + 98 - -63 - -(threadIdx_x + 98) // 63 - -49 - -(threadIdx_x + 98) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 98) // 63 * 49 - -7 - -threadIdx_x // 7 - -5 - -threadIdx_x // 7 + 5 - -9 - -(threadIdx_x // 7 + 5) % 9 - -7 - -(threadIdx_x // 7 + 5) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 98) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 98) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 98) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 98) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 98) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 98) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -98 - -threadIdx_x + 98 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 98] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= 
rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 98) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 98] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 98) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -3 - -threadIdx_x // 7 + 3 - -9 - -(threadIdx_x // 7 + 3) % 9 - -1 <= (threadIdx_x // 7 + 3) % 9 - -7 - -threadIdx_x // 7 - -3 - -threadIdx_x // 7 + 3 - -9 - -(threadIdx_x // 7 + 3) % 9 - -8 - -(threadIdx_x // 7 + 3) % 9 < 8 - -1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -147 - -threadIdx_x + 147 - -63 - -(threadIdx_x + 147) // 63 - -49 - -(threadIdx_x + 147) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 147) // 63 * 49 - -7 - -threadIdx_x // 7 - -3 - -threadIdx_x // 7 + 3 - -9 - -(threadIdx_x // 7 + 3) % 9 - -7 - -(threadIdx_x // 7 + 3) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 147) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 - 
-rc_outer_outer * 3136 + (threadIdx_x + 147) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 147) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 147) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 147) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 147) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -147 - -threadIdx_x + 147 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 147] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 147) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 147] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 147) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + 
threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -196 - -threadIdx_x + 196 - -63 - -(threadIdx_x + 196) // 63 - -49 - -(threadIdx_x + 196) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 196) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 196) // 63 * 49 + threadIdx_x - -rc_outer_outer * 3136 + (threadIdx_x + 196) // 63 * 49 + threadIdx_x + rx_outer_outer - -1 - -rc_outer_outer * 3136 + (threadIdx_x + 196) // 63 * 49 + threadIdx_x + rx_outer_outer - 1 - -data[rc_outer_outer * 3136 + (threadIdx_x + 196) // 63 * 49 + threadIdx_x + rx_outer_outer - 1] - -T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 196) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - -196 - -threadIdx_x + 196 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -rx_outer_outer = T.int32() -threadIdx_x = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 196] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 196) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 196] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 196) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -8 - -threadIdx_x // 7 + 8 - -9 - -(threadIdx_x // 7 + 8) % 9 - -1 <= 
(threadIdx_x // 7 + 8) % 9 - -7 - -threadIdx_x // 7 - -8 - -threadIdx_x // 7 + 8 - -9 - -(threadIdx_x // 7 + 8) % 9 - -8 - -(threadIdx_x // 7 + 8) % 9 < 8 - -1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -245 - -threadIdx_x + 245 - -63 - -(threadIdx_x + 245) // 63 - -49 - -(threadIdx_x + 245) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 245) // 63 * 49 - -7 - -threadIdx_x // 7 - -8 - -threadIdx_x // 7 + 8 - -9 - -(threadIdx_x // 7 + 8) % 9 - -7 - -(threadIdx_x // 7 + 8) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 245) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 245) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 245) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 245) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 245) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 245) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -245 - -threadIdx_x + 245 - 
-pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 245] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 245) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 245] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 245) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -6 - -threadIdx_x // 7 + 6 - -9 - -(threadIdx_x // 7 + 6) % 9 - -1 <= (threadIdx_x // 7 + 6) % 9 - -7 - -threadIdx_x // 7 - -6 - -threadIdx_x // 7 + 6 - -9 - -(threadIdx_x // 7 + 6) % 9 - -8 - -(threadIdx_x // 7 + 6) % 9 < 8 - -1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -294 - -threadIdx_x + 294 - -63 - -(threadIdx_x + 294) // 63 - -49 - -(threadIdx_x + 294) // 
63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 294) // 63 * 49 - -7 - -threadIdx_x // 7 - -6 - -threadIdx_x // 7 + 6 - -9 - -(threadIdx_x // 7 + 6) % 9 - -7 - -(threadIdx_x // 7 + 6) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 294) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 294) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 294) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 294) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 294) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 294) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -294 - -threadIdx_x + 294 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 294] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 294) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 294] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 
and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 294) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -4 - -threadIdx_x // 7 + 4 - -9 - -(threadIdx_x // 7 + 4) % 9 - -1 <= (threadIdx_x // 7 + 4) % 9 - -7 - -threadIdx_x // 7 - -4 - -threadIdx_x // 7 + 4 - -9 - -(threadIdx_x // 7 + 4) % 9 - -8 - -(threadIdx_x // 7 + 4) % 9 < 8 - -1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -343 - -threadIdx_x + 343 - -63 - -(threadIdx_x + 343) // 63 - -49 - -(threadIdx_x + 343) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 343) // 63 * 49 - -7 - -threadIdx_x // 7 - -4 - -threadIdx_x // 7 + 4 - -9 - -(threadIdx_x // 7 + 4) % 9 - -7 - -(threadIdx_x // 7 + 4) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 343) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 343) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 343) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 343) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 343) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 
<= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 343) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -343 - -threadIdx_x + 343 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 343] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 343) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 343] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 343) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -42 - -threadIdx_x < 42 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -392 - -threadIdx_x + 392 - -63 - -(threadIdx_x + 392) // 63 - -49 - -(threadIdx_x + 392) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 392) // 63 * 49 - 
-rc_outer_outer * 3136 + (threadIdx_x + 392) // 63 * 49 + threadIdx_x - -rc_outer_outer * 3136 + (threadIdx_x + 392) // 63 * 49 + threadIdx_x + rx_outer_outer - -6 - -rc_outer_outer * 3136 + (threadIdx_x + 392) // 63 * 49 + threadIdx_x + rx_outer_outer + 6 - -data[rc_outer_outer * 3136 + (threadIdx_x + 392) // 63 * 49 + threadIdx_x + rx_outer_outer + 6] - -T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 392) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - -392 - -threadIdx_x + 392 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 392] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 392) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 392] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 392) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - -7 - -7 <= threadIdx_x - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -rc_outer_outer * 3136 + threadIdx_x - 
-rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer - -335 - -rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 335 - -data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 335] - -T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 335], T.float32(0.0)) - -441 - -threadIdx_x + 441 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 441] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 335], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 441] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 335], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -7 - -threadIdx_x // 7 + 7 - -9 - -(threadIdx_x // 7 + 7) % 9 - -1 <= (threadIdx_x // 7 + 7) % 9 - -7 - -threadIdx_x // 7 - -7 - -threadIdx_x // 7 + 7 - -9 - -(threadIdx_x // 7 + 7) % 9 - -8 - -(threadIdx_x // 7 + 7) % 9 < 8 - -1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer 
+ threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -490 - -threadIdx_x + 490 - -63 - -(threadIdx_x + 490) // 63 - -49 - -(threadIdx_x + 490) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 490) // 63 * 49 - -7 - -threadIdx_x // 7 - -7 - -threadIdx_x // 7 + 7 - -9 - -(threadIdx_x // 7 + 7) % 9 - -7 - -(threadIdx_x // 7 + 7) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 490) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 490) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 490) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 490) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 490) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 490) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -490 - -threadIdx_x + 490 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 490] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 490) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = 
T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 490] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 490) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -5 - -threadIdx_x // 7 + 5 - -9 - -(threadIdx_x // 7 + 5) % 9 - -1 <= (threadIdx_x // 7 + 5) % 9 - -7 - -threadIdx_x // 7 - -5 - -threadIdx_x // 7 + 5 - -9 - -(threadIdx_x // 7 + 5) % 9 - -8 - -(threadIdx_x // 7 + 5) % 9 < 8 - -1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -539 - -threadIdx_x + 539 - -63 - -(threadIdx_x + 539) // 63 - -49 - -(threadIdx_x + 539) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 539) // 63 * 49 - -7 - -threadIdx_x // 7 - -5 - -threadIdx_x // 7 + 5 - -9 - -(threadIdx_x // 7 + 5) % 9 - -7 - -(threadIdx_x // 7 + 5) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 539) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 539) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 539) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 539) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + 
rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 539) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 539) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -539 - -threadIdx_x + 539 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 539] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 539) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 539] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 539) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -3 - -threadIdx_x // 7 + 3 - -9 - -(threadIdx_x // 7 + 3) % 9 - -1 <= (threadIdx_x // 7 + 3) % 9 - -7 - -threadIdx_x // 7 - -3 - -threadIdx_x // 7 + 3 - -9 - -(threadIdx_x // 7 + 3) % 9 - -8 - -(threadIdx_x // 7 + 3) % 9 < 8 - -1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= 
rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -588 - -threadIdx_x + 588 - -63 - -(threadIdx_x + 588) // 63 - -49 - -(threadIdx_x + 588) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 588) // 63 * 49 - -7 - -threadIdx_x // 7 - -3 - -threadIdx_x // 7 + 3 - -9 - -(threadIdx_x // 7 + 3) % 9 - -7 - -(threadIdx_x // 7 + 3) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 588) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 588) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 588) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 588) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 588) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 588) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -588 - -threadIdx_x + 588 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 588] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer 
+ threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 588) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 588] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 588) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -637 - -threadIdx_x + 637 - -63 - -(threadIdx_x + 637) // 63 - -49 - -(threadIdx_x + 637) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 637) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 637) // 63 * 49 + threadIdx_x - -rc_outer_outer * 3136 + (threadIdx_x + 637) // 63 * 49 + threadIdx_x + rx_outer_outer - -1 - -rc_outer_outer * 3136 + (threadIdx_x + 637) // 63 * 49 + threadIdx_x + rx_outer_outer - 1 - -data[rc_outer_outer * 3136 + (threadIdx_x + 637) // 63 * 49 + threadIdx_x + rx_outer_outer - 1] - -T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 637) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - -637 - -threadIdx_x + 637 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -rx_outer_outer = T.int32() -threadIdx_x = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer 
= T.int32() -pad_temp_shared[threadIdx_x + 637] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 637) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 637] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 637) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -8 - -threadIdx_x // 7 + 8 - -9 - -(threadIdx_x // 7 + 8) % 9 - -1 <= (threadIdx_x // 7 + 8) % 9 - -7 - -threadIdx_x // 7 - -8 - -threadIdx_x // 7 + 8 - -9 - -(threadIdx_x // 7 + 8) % 9 - -8 - -(threadIdx_x // 7 + 8) % 9 < 8 - -1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -686 - -threadIdx_x + 686 - -63 - -(threadIdx_x + 686) // 63 - -49 - -(threadIdx_x + 686) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 686) // 63 * 49 - -7 - -threadIdx_x // 7 - -8 - -threadIdx_x // 7 + 8 - -9 - -(threadIdx_x // 7 + 8) % 9 - -7 - -(threadIdx_x // 7 + 8) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 686) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 686) // 63 * 49 + (threadIdx_x // 7 + 
8) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 686) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 686) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 686) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 686) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -686 - -threadIdx_x + 686 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 686] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 686) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 686] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 686) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -6 - -threadIdx_x // 7 + 6 - -9 - -(threadIdx_x // 7 + 6) % 9 - -1 <= (threadIdx_x // 7 + 6) % 9 - 
-7 - -threadIdx_x // 7 - -6 - -threadIdx_x // 7 + 6 - -9 - -(threadIdx_x // 7 + 6) % 9 - -8 - -(threadIdx_x // 7 + 6) % 9 < 8 - -1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -735 - -threadIdx_x + 735 - -63 - -(threadIdx_x + 735) // 63 - -49 - -(threadIdx_x + 735) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 735) // 63 * 49 - -7 - -threadIdx_x // 7 - -6 - -threadIdx_x // 7 + 6 - -9 - -(threadIdx_x // 7 + 6) % 9 - -7 - -(threadIdx_x // 7 + 6) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 735) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 735) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 735) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 735) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 735) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 735) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -735 - -threadIdx_x + 735 - -pad_temp_shared = T.Buffer((4032,), 
scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 735] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 735) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 735] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 735) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -4 - -threadIdx_x // 7 + 4 - -9 - -(threadIdx_x // 7 + 4) % 9 - -1 <= (threadIdx_x // 7 + 4) % 9 - -7 - -threadIdx_x // 7 - -4 - -threadIdx_x // 7 + 4 - -9 - -(threadIdx_x // 7 + 4) % 9 - -8 - -(threadIdx_x // 7 + 4) % 9 < 8 - -1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -784 - -threadIdx_x + 784 - -63 - -(threadIdx_x + 784) // 63 - -49 - -(threadIdx_x + 784) // 63 * 49 - -rc_outer_outer * 3136 + 
(threadIdx_x + 784) // 63 * 49 - -7 - -threadIdx_x // 7 - -4 - -threadIdx_x // 7 + 4 - -9 - -(threadIdx_x // 7 + 4) % 9 - -7 - -(threadIdx_x // 7 + 4) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 784) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 784) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 784) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 784) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 784) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 784) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -784 - -threadIdx_x + 784 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 784] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 784) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 784] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + 
threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 784) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -42 - -threadIdx_x < 42 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -833 - -threadIdx_x + 833 - -63 - -(threadIdx_x + 833) // 63 - -49 - -(threadIdx_x + 833) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 833) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 833) // 63 * 49 + threadIdx_x - -rc_outer_outer * 3136 + (threadIdx_x + 833) // 63 * 49 + threadIdx_x + rx_outer_outer - -6 - -rc_outer_outer * 3136 + (threadIdx_x + 833) // 63 * 49 + threadIdx_x + rx_outer_outer + 6 - -data[rc_outer_outer * 3136 + (threadIdx_x + 833) // 63 * 49 + threadIdx_x + rx_outer_outer + 6] - -T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 833) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - -833 - -threadIdx_x + 833 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 833] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 833) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - 
data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 833] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 833) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - -7 - -7 <= threadIdx_x - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -rc_outer_outer * 3136 + threadIdx_x - -rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer - -678 - -rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 678 - -data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 678] - -T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 678], T.float32(0.0)) - -882 - -threadIdx_x + 882 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 882] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 678], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 882] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + 
threadIdx_x + rx_outer_outer + 678], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -7 - -threadIdx_x // 7 + 7 - -9 - -(threadIdx_x // 7 + 7) % 9 - -1 <= (threadIdx_x // 7 + 7) % 9 - -7 - -threadIdx_x // 7 - -7 - -threadIdx_x // 7 + 7 - -9 - -(threadIdx_x // 7 + 7) % 9 - -8 - -(threadIdx_x // 7 + 7) % 9 < 8 - -1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -931 - -threadIdx_x + 931 - -63 - -(threadIdx_x + 931) // 63 - -49 - -(threadIdx_x + 931) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 931) // 63 * 49 - -7 - -threadIdx_x // 7 - -7 - -threadIdx_x // 7 + 7 - -9 - -(threadIdx_x // 7 + 7) % 9 - -7 - -(threadIdx_x // 7 + 7) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 931) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 931) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 931) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 931) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 931) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + 
(threadIdx_x + 931) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -931 - -threadIdx_x + 931 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 931] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 931) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 931] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 931) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -5 - -threadIdx_x // 7 + 5 - -9 - -(threadIdx_x // 7 + 5) % 9 - -1 <= (threadIdx_x // 7 + 5) % 9 - -7 - -threadIdx_x // 7 - -5 - -threadIdx_x // 7 + 5 - -9 - -(threadIdx_x // 7 + 5) % 9 - -8 - -(threadIdx_x // 7 + 5) % 9 < 8 - -1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and 
rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -980 - -threadIdx_x + 980 - -63 - -(threadIdx_x + 980) // 63 - -49 - -(threadIdx_x + 980) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 980) // 63 * 49 - -7 - -threadIdx_x // 7 - -5 - -threadIdx_x // 7 + 5 - -9 - -(threadIdx_x // 7 + 5) % 9 - -7 - -(threadIdx_x // 7 + 5) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 980) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 980) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 980) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 980) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 980) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 980) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -980 - -threadIdx_x + 980 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 980] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 980) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = 
T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 980] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 980) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -3 - -threadIdx_x // 7 + 3 - -9 - -(threadIdx_x // 7 + 3) % 9 - -1 <= (threadIdx_x // 7 + 3) % 9 - -7 - -threadIdx_x // 7 - -3 - -threadIdx_x // 7 + 3 - -9 - -(threadIdx_x // 7 + 3) % 9 - -8 - -(threadIdx_x // 7 + 3) % 9 < 8 - -1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -1029 - -threadIdx_x + 1029 - -63 - -(threadIdx_x + 1029) // 63 - -49 - -(threadIdx_x + 1029) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 1029) // 63 * 49 - -7 - -threadIdx_x // 7 - -3 - -threadIdx_x // 7 + 3 - -9 - -(threadIdx_x // 7 + 3) % 9 - -7 - -(threadIdx_x // 7 + 3) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1029) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1029) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1029) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 1029) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer 
+ threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 1029) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1029) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1029 - -threadIdx_x + 1029 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 1029] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1029) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 1029] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1029) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -1078 - -threadIdx_x + 1078 - -63 - -(threadIdx_x + 1078) // 63 - -49 - -(threadIdx_x + 
1078) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 1078) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 1078) // 63 * 49 + threadIdx_x - -rc_outer_outer * 3136 + (threadIdx_x + 1078) // 63 * 49 + threadIdx_x + rx_outer_outer - -1 - -rc_outer_outer * 3136 + (threadIdx_x + 1078) // 63 * 49 + threadIdx_x + rx_outer_outer - 1 - -data[rc_outer_outer * 3136 + (threadIdx_x + 1078) // 63 * 49 + threadIdx_x + rx_outer_outer - 1] - -T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1078) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - -1078 - -threadIdx_x + 1078 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -rx_outer_outer = T.int32() -threadIdx_x = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 1078] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1078) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 1078] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1078) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -8 - -threadIdx_x // 7 + 8 - -9 - -(threadIdx_x // 7 + 8) % 9 - -1 <= (threadIdx_x // 7 + 8) % 9 - -7 - -threadIdx_x // 7 - -8 - -threadIdx_x // 7 + 8 - -9 - -(threadIdx_x // 7 + 8) % 9 - -8 - -(threadIdx_x // 7 + 8) % 9 < 8 - -1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 
8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -1127 - -threadIdx_x + 1127 - -63 - -(threadIdx_x + 1127) // 63 - -49 - -(threadIdx_x + 1127) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 1127) // 63 * 49 - -7 - -threadIdx_x // 7 - -8 - -threadIdx_x // 7 + 8 - -9 - -(threadIdx_x // 7 + 8) % 9 - -7 - -(threadIdx_x // 7 + 8) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1127) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1127) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1127) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 1127) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 1127) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1127) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1127 - -threadIdx_x + 1127 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 1127] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + 
threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1127) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 1127] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1127) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -6 - -threadIdx_x // 7 + 6 - -9 - -(threadIdx_x // 7 + 6) % 9 - -1 <= (threadIdx_x // 7 + 6) % 9 - -7 - -threadIdx_x // 7 - -6 - -threadIdx_x // 7 + 6 - -9 - -(threadIdx_x // 7 + 6) % 9 - -8 - -(threadIdx_x // 7 + 6) % 9 < 8 - -1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -1176 - -threadIdx_x + 1176 - -63 - -(threadIdx_x + 1176) // 63 - -49 - -(threadIdx_x + 1176) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 1176) // 63 * 49 - -7 - -threadIdx_x // 7 - -6 - -threadIdx_x // 7 + 6 - -9 - -(threadIdx_x // 7 + 6) % 9 - -7 - -(threadIdx_x // 7 + 6) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1176) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1176) 
// 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1176) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 1176) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 1176) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1176) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1176 - -threadIdx_x + 1176 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 1176] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1176) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 1176] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1176) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -4 - -threadIdx_x // 7 + 4 - -9 - -(threadIdx_x // 7 + 4) 
% 9 - -1 <= (threadIdx_x // 7 + 4) % 9 - -7 - -threadIdx_x // 7 - -4 - -threadIdx_x // 7 + 4 - -9 - -(threadIdx_x // 7 + 4) % 9 - -8 - -(threadIdx_x // 7 + 4) % 9 < 8 - -1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -1225 - -threadIdx_x + 1225 - -63 - -(threadIdx_x + 1225) // 63 - -49 - -(threadIdx_x + 1225) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 1225) // 63 * 49 - -7 - -threadIdx_x // 7 - -4 - -threadIdx_x // 7 + 4 - -9 - -(threadIdx_x // 7 + 4) % 9 - -7 - -(threadIdx_x // 7 + 4) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1225) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1225) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1225) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 1225) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 1225) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1225) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1225 - 
-threadIdx_x + 1225 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 1225] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1225) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 1225] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1225) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -42 - -threadIdx_x < 42 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -1274 - -threadIdx_x + 1274 - -63 - -(threadIdx_x + 1274) // 63 - -49 - -(threadIdx_x + 1274) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 1274) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 1274) // 63 * 49 + threadIdx_x - -rc_outer_outer * 3136 + (threadIdx_x + 1274) // 63 * 49 + threadIdx_x + rx_outer_outer - -6 - -rc_outer_outer * 3136 + (threadIdx_x + 1274) // 63 * 49 + threadIdx_x + rx_outer_outer + 6 - -data[rc_outer_outer * 3136 + 
(threadIdx_x + 1274) // 63 * 49 + threadIdx_x + rx_outer_outer + 6] - -T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1274) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - -1274 - -threadIdx_x + 1274 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 1274] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1274) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 1274] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1274) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - -7 - -7 <= threadIdx_x - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -rc_outer_outer * 3136 + threadIdx_x - -rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer - -1021 - -rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1021 - -data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1021] - -T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + 
threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1021], T.float32(0.0)) - -1323 - -threadIdx_x + 1323 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 1323] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1021], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 1323] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1021], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -7 - -threadIdx_x // 7 + 7 - -9 - -(threadIdx_x // 7 + 7) % 9 - -1 <= (threadIdx_x // 7 + 7) % 9 - -7 - -threadIdx_x // 7 - -7 - -threadIdx_x // 7 + 7 - -9 - -(threadIdx_x // 7 + 7) % 9 - -8 - -(threadIdx_x // 7 + 7) % 9 < 8 - -1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -1372 - -threadIdx_x + 1372 - -63 - -(threadIdx_x + 1372) // 63 - -49 - -(threadIdx_x + 1372) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 1372) // 63 * 49 - -7 - -threadIdx_x // 7 
- -7 - -threadIdx_x // 7 + 7 - -9 - -(threadIdx_x // 7 + 7) % 9 - -7 - -(threadIdx_x // 7 + 7) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1372) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1372) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1372) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 1372) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 1372) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1372) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1372 - -threadIdx_x + 1372 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 1372] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1372) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 1372] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 
8, data[rc_outer_outer * 3136 + (threadIdx_x + 1372) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -5 - -threadIdx_x // 7 + 5 - -9 - -(threadIdx_x // 7 + 5) % 9 - -1 <= (threadIdx_x // 7 + 5) % 9 - -7 - -threadIdx_x // 7 - -5 - -threadIdx_x // 7 + 5 - -9 - -(threadIdx_x // 7 + 5) % 9 - -8 - -(threadIdx_x // 7 + 5) % 9 < 8 - -1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -1421 - -threadIdx_x + 1421 - -63 - -(threadIdx_x + 1421) // 63 - -49 - -(threadIdx_x + 1421) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 1421) // 63 * 49 - -7 - -threadIdx_x // 7 - -5 - -threadIdx_x // 7 + 5 - -9 - -(threadIdx_x // 7 + 5) % 9 - -7 - -(threadIdx_x // 7 + 5) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1421) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1421) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1421) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 1421) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 1421) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 
<= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1421) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1421 - -threadIdx_x + 1421 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 1421] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1421) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 1421] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1421) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -3 - -threadIdx_x // 7 + 3 - -9 - -(threadIdx_x // 7 + 3) % 9 - -1 <= (threadIdx_x // 7 + 3) % 9 - -7 - -threadIdx_x // 7 - -3 - -threadIdx_x // 7 + 3 - -9 - -(threadIdx_x // 7 + 3) % 9 - -8 - -(threadIdx_x // 7 + 3) % 9 < 8 - -1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= 
(threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -1470 - -threadIdx_x + 1470 - -63 - -(threadIdx_x + 1470) // 63 - -49 - -(threadIdx_x + 1470) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 1470) // 63 * 49 - -7 - -threadIdx_x // 7 - -3 - -threadIdx_x // 7 + 3 - -9 - -(threadIdx_x // 7 + 3) % 9 - -7 - -(threadIdx_x // 7 + 3) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1470) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1470) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1470) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 1470) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 1470) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1470) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1470 - -threadIdx_x + 1470 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 1470] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1470) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with 
T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 1470] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1470) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -1519 - -threadIdx_x + 1519 - -63 - -(threadIdx_x + 1519) // 63 - -49 - -(threadIdx_x + 1519) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 1519) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 1519) // 63 * 49 + threadIdx_x - -rc_outer_outer * 3136 + (threadIdx_x + 1519) // 63 * 49 + threadIdx_x + rx_outer_outer - -1 - -rc_outer_outer * 3136 + (threadIdx_x + 1519) // 63 * 49 + threadIdx_x + rx_outer_outer - 1 - -data[rc_outer_outer * 3136 + (threadIdx_x + 1519) // 63 * 49 + threadIdx_x + rx_outer_outer - 1] - -T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1519) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - -1519 - -threadIdx_x + 1519 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -rx_outer_outer = T.int32() -threadIdx_x = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 1519] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1519) // 63 * 
49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 1519] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1519) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -8 - -threadIdx_x // 7 + 8 - -9 - -(threadIdx_x // 7 + 8) % 9 - -1 <= (threadIdx_x // 7 + 8) % 9 - -7 - -threadIdx_x // 7 - -8 - -threadIdx_x // 7 + 8 - -9 - -(threadIdx_x // 7 + 8) % 9 - -8 - -(threadIdx_x // 7 + 8) % 9 < 8 - -1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -1568 - -threadIdx_x + 1568 - -63 - -(threadIdx_x + 1568) // 63 - -49 - -(threadIdx_x + 1568) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 1568) // 63 * 49 - -7 - -threadIdx_x // 7 - -8 - -threadIdx_x // 7 + 8 - -9 - -(threadIdx_x // 7 + 8) % 9 - -7 - -(threadIdx_x // 7 + 8) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1568) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1568) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1568) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - 
-rc_outer_outer * 3136 + (threadIdx_x + 1568) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 1568) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1568) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1568 - -threadIdx_x + 1568 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 1568] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1568) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 1568] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1568) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -6 - -threadIdx_x // 7 + 6 - -9 - -(threadIdx_x // 7 + 6) % 9 - -1 <= (threadIdx_x // 7 + 6) % 9 - -7 - -threadIdx_x // 7 - -6 - -threadIdx_x // 7 + 6 - -9 - -(threadIdx_x // 7 + 6) % 9 - -8 - -(threadIdx_x // 7 + 6) % 9 < 8 - -1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x 
// 7 + 6) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -1617 - -threadIdx_x + 1617 - -63 - -(threadIdx_x + 1617) // 63 - -49 - -(threadIdx_x + 1617) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 1617) // 63 * 49 - -7 - -threadIdx_x // 7 - -6 - -threadIdx_x // 7 + 6 - -9 - -(threadIdx_x // 7 + 6) % 9 - -7 - -(threadIdx_x // 7 + 6) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1617) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1617) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1617) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 1617) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 1617) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1617) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1617 - -threadIdx_x + 1617 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 1617] = 
T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1617) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 1617] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1617) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -4 - -threadIdx_x // 7 + 4 - -9 - -(threadIdx_x // 7 + 4) % 9 - -1 <= (threadIdx_x // 7 + 4) % 9 - -7 - -threadIdx_x // 7 - -4 - -threadIdx_x // 7 + 4 - -9 - -(threadIdx_x // 7 + 4) % 9 - -8 - -(threadIdx_x // 7 + 4) % 9 < 8 - -1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -1666 - -threadIdx_x + 1666 - -63 - -(threadIdx_x + 1666) // 63 - -49 - -(threadIdx_x + 1666) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 1666) // 63 * 49 - -7 - -threadIdx_x // 7 - -4 - -threadIdx_x // 7 + 4 - -9 - -(threadIdx_x // 7 + 4) % 9 - -7 - -(threadIdx_x // 7 + 4) % 9 
* 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1666) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1666) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1666) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 1666) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 1666) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1666) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1666 - -threadIdx_x + 1666 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 1666] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1666) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 1666] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1666) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + 
rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -42 - -threadIdx_x < 42 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -1715 - -threadIdx_x + 1715 - -63 - -(threadIdx_x + 1715) // 63 - -49 - -(threadIdx_x + 1715) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 1715) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 1715) // 63 * 49 + threadIdx_x - -rc_outer_outer * 3136 + (threadIdx_x + 1715) // 63 * 49 + threadIdx_x + rx_outer_outer - -6 - -rc_outer_outer * 3136 + (threadIdx_x + 1715) // 63 * 49 + threadIdx_x + rx_outer_outer + 6 - -data[rc_outer_outer * 3136 + (threadIdx_x + 1715) // 63 * 49 + threadIdx_x + rx_outer_outer + 6] - -T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1715) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - -1715 - -threadIdx_x + 1715 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 1715] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1715) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 1715] = T.if_then_else(threadIdx_x < 42 and 1 <= 
rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1715) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - -7 - -7 <= threadIdx_x - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -rc_outer_outer * 3136 + threadIdx_x - -rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer - -1364 - -rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1364 - -data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1364] - -T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1364], T.float32(0.0)) - -1764 - -threadIdx_x + 1764 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 1764] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1364], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 1764] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1364], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -7 - -threadIdx_x // 7 + 7 - -9 - 
-(threadIdx_x // 7 + 7) % 9 - -1 <= (threadIdx_x // 7 + 7) % 9 - -7 - -threadIdx_x // 7 - -7 - -threadIdx_x // 7 + 7 - -9 - -(threadIdx_x // 7 + 7) % 9 - -8 - -(threadIdx_x // 7 + 7) % 9 < 8 - -1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -1813 - -threadIdx_x + 1813 - -63 - -(threadIdx_x + 1813) // 63 - -49 - -(threadIdx_x + 1813) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 1813) // 63 * 49 - -7 - -threadIdx_x // 7 - -7 - -threadIdx_x // 7 + 7 - -9 - -(threadIdx_x // 7 + 7) % 9 - -7 - -(threadIdx_x // 7 + 7) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1813) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1813) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1813) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 1813) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 1813) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1813) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], 
T.float32(0.0)) - -1813 - -threadIdx_x + 1813 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 1813] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1813) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 1813] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1813) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -5 - -threadIdx_x // 7 + 5 - -9 - -(threadIdx_x // 7 + 5) % 9 - -1 <= (threadIdx_x // 7 + 5) % 9 - -7 - -threadIdx_x // 7 - -5 - -threadIdx_x // 7 + 5 - -9 - -(threadIdx_x // 7 + 5) % 9 - -8 - -(threadIdx_x // 7 + 5) % 9 < 8 - -1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -1862 - -threadIdx_x + 1862 - -63 - 
-(threadIdx_x + 1862) // 63 - -49 - -(threadIdx_x + 1862) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 1862) // 63 * 49 - -7 - -threadIdx_x // 7 - -5 - -threadIdx_x // 7 + 5 - -9 - -(threadIdx_x // 7 + 5) % 9 - -7 - -(threadIdx_x // 7 + 5) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1862) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1862) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1862) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 1862) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 1862) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1862) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1862 - -threadIdx_x + 1862 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 1862] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1862) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 1862] = 
T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1862) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -3 - -threadIdx_x // 7 + 3 - -9 - -(threadIdx_x // 7 + 3) % 9 - -1 <= (threadIdx_x // 7 + 3) % 9 - -7 - -threadIdx_x // 7 - -3 - -threadIdx_x // 7 + 3 - -9 - -(threadIdx_x // 7 + 3) % 9 - -8 - -(threadIdx_x // 7 + 3) % 9 < 8 - -1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -1911 - -threadIdx_x + 1911 - -63 - -(threadIdx_x + 1911) // 63 - -49 - -(threadIdx_x + 1911) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 1911) // 63 * 49 - -7 - -threadIdx_x // 7 - -3 - -threadIdx_x // 7 + 3 - -9 - -(threadIdx_x // 7 + 3) % 9 - -7 - -(threadIdx_x // 7 + 3) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1911) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1911) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 1911) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 1911) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 1911) // 63 * 49 
+ (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1911) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1911 - -threadIdx_x + 1911 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 1911] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1911) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 1911] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1911) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -1960 - -threadIdx_x + 1960 - -63 - -(threadIdx_x + 1960) // 63 - -49 - -(threadIdx_x + 1960) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 1960) // 63 * 49 - 
-rc_outer_outer * 3136 + (threadIdx_x + 1960) // 63 * 49 + threadIdx_x - -rc_outer_outer * 3136 + (threadIdx_x + 1960) // 63 * 49 + threadIdx_x + rx_outer_outer - -1 - -rc_outer_outer * 3136 + (threadIdx_x + 1960) // 63 * 49 + threadIdx_x + rx_outer_outer - 1 - -data[rc_outer_outer * 3136 + (threadIdx_x + 1960) // 63 * 49 + threadIdx_x + rx_outer_outer - 1] - -T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1960) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - -1960 - -threadIdx_x + 1960 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -rx_outer_outer = T.int32() -threadIdx_x = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 1960] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1960) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 1960] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1960) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -8 - -threadIdx_x // 7 + 8 - -9 - -(threadIdx_x // 7 + 8) % 9 - -1 <= (threadIdx_x // 7 + 8) % 9 - -7 - -threadIdx_x // 7 - -8 - -threadIdx_x // 7 + 8 - -9 - -(threadIdx_x // 7 + 8) % 9 - -8 - -(threadIdx_x // 7 + 8) % 9 < 8 - -1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + 
threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -2009 - -threadIdx_x + 2009 - -63 - -(threadIdx_x + 2009) // 63 - -49 - -(threadIdx_x + 2009) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 2009) // 63 * 49 - -7 - -threadIdx_x // 7 - -8 - -threadIdx_x // 7 + 8 - -9 - -(threadIdx_x // 7 + 8) % 9 - -7 - -(threadIdx_x // 7 + 8) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2009) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2009) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2009) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 2009) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 2009) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2009) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -2009 - -threadIdx_x + 2009 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 2009] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2009) // 63 
* 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 2009] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2009) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -6 - -threadIdx_x // 7 + 6 - -9 - -(threadIdx_x // 7 + 6) % 9 - -1 <= (threadIdx_x // 7 + 6) % 9 - -7 - -threadIdx_x // 7 - -6 - -threadIdx_x // 7 + 6 - -9 - -(threadIdx_x // 7 + 6) % 9 - -8 - -(threadIdx_x // 7 + 6) % 9 < 8 - -1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -2058 - -threadIdx_x + 2058 - -63 - -(threadIdx_x + 2058) // 63 - -49 - -(threadIdx_x + 2058) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 2058) // 63 * 49 - -7 - -threadIdx_x // 7 - -6 - -threadIdx_x // 7 + 6 - -9 - -(threadIdx_x // 7 + 6) % 9 - -7 - -(threadIdx_x // 7 + 6) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2058) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2058) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer - -7 - 
-threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2058) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 2058) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 2058) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2058) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -2058 - -threadIdx_x + 2058 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 2058] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2058) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 2058] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2058) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -4 - -threadIdx_x // 7 + 4 - -9 - -(threadIdx_x // 7 + 4) % 9 - -1 <= (threadIdx_x // 7 + 4) % 9 - -7 - -threadIdx_x // 7 - 
-4 - -threadIdx_x // 7 + 4 - -9 - -(threadIdx_x // 7 + 4) % 9 - -8 - -(threadIdx_x // 7 + 4) % 9 < 8 - -1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -2107 - -threadIdx_x + 2107 - -63 - -(threadIdx_x + 2107) // 63 - -49 - -(threadIdx_x + 2107) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 2107) // 63 * 49 - -7 - -threadIdx_x // 7 - -4 - -threadIdx_x // 7 + 4 - -9 - -(threadIdx_x // 7 + 4) % 9 - -7 - -(threadIdx_x // 7 + 4) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2107) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2107) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2107) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 2107) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 2107) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2107) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -2107 - -threadIdx_x + 2107 - -pad_temp_shared = T.Buffer((4032,), 
scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 2107] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2107) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 2107] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2107) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -42 - -threadIdx_x < 42 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -2156 - -threadIdx_x + 2156 - -63 - -(threadIdx_x + 2156) // 63 - -49 - -(threadIdx_x + 2156) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 2156) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 2156) // 63 * 49 + threadIdx_x - -rc_outer_outer * 3136 + (threadIdx_x + 2156) // 63 * 49 + threadIdx_x + rx_outer_outer - -6 - -rc_outer_outer * 3136 + (threadIdx_x + 2156) // 63 * 49 + threadIdx_x + rx_outer_outer + 6 - -data[rc_outer_outer * 3136 + (threadIdx_x + 2156) // 63 * 49 + threadIdx_x + rx_outer_outer + 6] 
- -T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2156) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - -2156 - -threadIdx_x + 2156 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 2156] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2156) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 2156] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2156) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - -7 - -7 <= threadIdx_x - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -rc_outer_outer * 3136 + threadIdx_x - -rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer - -1707 - -rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1707 - -data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1707] - -T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + 
rx_outer_outer + 1707], T.float32(0.0)) - -2205 - -threadIdx_x + 2205 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 2205] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1707], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 2205] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1707], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -7 - -threadIdx_x // 7 + 7 - -9 - -(threadIdx_x // 7 + 7) % 9 - -1 <= (threadIdx_x // 7 + 7) % 9 - -7 - -threadIdx_x // 7 - -7 - -threadIdx_x // 7 + 7 - -9 - -(threadIdx_x // 7 + 7) % 9 - -8 - -(threadIdx_x // 7 + 7) % 9 < 8 - -1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -2254 - -threadIdx_x + 2254 - -63 - -(threadIdx_x + 2254) // 63 - -49 - -(threadIdx_x + 2254) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 2254) // 63 * 49 - -7 - -threadIdx_x // 7 - -7 - -threadIdx_x // 7 + 7 - -9 - -(threadIdx_x // 7 + 7) % 9 
- -7 - -(threadIdx_x // 7 + 7) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2254) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2254) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2254) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 2254) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 2254) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2254) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -2254 - -threadIdx_x + 2254 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 2254] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2254) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 2254] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2254) // 63 * 49 
+ (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -5 - -threadIdx_x // 7 + 5 - -9 - -(threadIdx_x // 7 + 5) % 9 - -1 <= (threadIdx_x // 7 + 5) % 9 - -7 - -threadIdx_x // 7 - -5 - -threadIdx_x // 7 + 5 - -9 - -(threadIdx_x // 7 + 5) % 9 - -8 - -(threadIdx_x // 7 + 5) % 9 < 8 - -1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -2303 - -threadIdx_x + 2303 - -63 - -(threadIdx_x + 2303) // 63 - -49 - -(threadIdx_x + 2303) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 2303) // 63 * 49 - -7 - -threadIdx_x // 7 - -5 - -threadIdx_x // 7 + 5 - -9 - -(threadIdx_x // 7 + 5) % 9 - -7 - -(threadIdx_x // 7 + 5) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2303) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2303) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2303) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 2303) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 2303) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + 
threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2303) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -2303 - -threadIdx_x + 2303 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 2303] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2303) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 2303] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2303) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -3 - -threadIdx_x // 7 + 3 - -9 - -(threadIdx_x // 7 + 3) % 9 - -1 <= (threadIdx_x // 7 + 3) % 9 - -7 - -threadIdx_x // 7 - -3 - -threadIdx_x // 7 + 3 - -9 - -(threadIdx_x // 7 + 3) % 9 - -8 - -(threadIdx_x // 7 + 3) % 9 < 8 - -1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 
and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -2352 - -threadIdx_x + 2352 - -63 - -(threadIdx_x + 2352) // 63 - -49 - -(threadIdx_x + 2352) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 2352) // 63 * 49 - -7 - -threadIdx_x // 7 - -3 - -threadIdx_x // 7 + 3 - -9 - -(threadIdx_x // 7 + 3) % 9 - -7 - -(threadIdx_x // 7 + 3) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2352) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2352) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2352) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 2352) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 2352) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2352) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -2352 - -threadIdx_x + 2352 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 2352] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2352) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = 
T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 2352] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2352) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -2401 - -threadIdx_x + 2401 - -63 - -(threadIdx_x + 2401) // 63 - -49 - -(threadIdx_x + 2401) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 2401) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 2401) // 63 * 49 + threadIdx_x - -rc_outer_outer * 3136 + (threadIdx_x + 2401) // 63 * 49 + threadIdx_x + rx_outer_outer - -1 - -rc_outer_outer * 3136 + (threadIdx_x + 2401) // 63 * 49 + threadIdx_x + rx_outer_outer - 1 - -data[rc_outer_outer * 3136 + (threadIdx_x + 2401) // 63 * 49 + threadIdx_x + rx_outer_outer - 1] - -T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2401) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - -2401 - -threadIdx_x + 2401 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -rx_outer_outer = T.int32() -threadIdx_x = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 2401] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2401) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - -with 
T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 2401] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2401) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -8 - -threadIdx_x // 7 + 8 - -9 - -(threadIdx_x // 7 + 8) % 9 - -1 <= (threadIdx_x // 7 + 8) % 9 - -7 - -threadIdx_x // 7 - -8 - -threadIdx_x // 7 + 8 - -9 - -(threadIdx_x // 7 + 8) % 9 - -8 - -(threadIdx_x // 7 + 8) % 9 < 8 - -1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -2450 - -threadIdx_x + 2450 - -63 - -(threadIdx_x + 2450) // 63 - -49 - -(threadIdx_x + 2450) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 2450) // 63 * 49 - -7 - -threadIdx_x // 7 - -8 - -threadIdx_x // 7 + 8 - -9 - -(threadIdx_x // 7 + 8) % 9 - -7 - -(threadIdx_x // 7 + 8) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2450) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2450) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2450) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 2450) // 63 * 49 + 
(threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 2450) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2450) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -2450 - -threadIdx_x + 2450 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 2450] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2450) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 2450] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2450) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -6 - -threadIdx_x // 7 + 6 - -9 - -(threadIdx_x // 7 + 6) % 9 - -1 <= (threadIdx_x // 7 + 6) % 9 - -7 - -threadIdx_x // 7 - -6 - -threadIdx_x // 7 + 6 - -9 - -(threadIdx_x // 7 + 6) % 9 - -8 - -(threadIdx_x // 7 + 6) % 9 < 8 - -1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - 
-rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -2499 - -threadIdx_x + 2499 - -63 - -(threadIdx_x + 2499) // 63 - -49 - -(threadIdx_x + 2499) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 2499) // 63 * 49 - -7 - -threadIdx_x // 7 - -6 - -threadIdx_x // 7 + 6 - -9 - -(threadIdx_x // 7 + 6) % 9 - -7 - -(threadIdx_x // 7 + 6) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2499) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2499) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2499) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 2499) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 2499) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2499) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -2499 - -threadIdx_x + 2499 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 2499] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and 
(threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2499) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 2499] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2499) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -4 - -threadIdx_x // 7 + 4 - -9 - -(threadIdx_x // 7 + 4) % 9 - -1 <= (threadIdx_x // 7 + 4) % 9 - -7 - -threadIdx_x // 7 - -4 - -threadIdx_x // 7 + 4 - -9 - -(threadIdx_x // 7 + 4) % 9 - -8 - -(threadIdx_x // 7 + 4) % 9 < 8 - -1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -2548 - -threadIdx_x + 2548 - -63 - -(threadIdx_x + 2548) // 63 - -49 - -(threadIdx_x + 2548) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 2548) // 63 * 49 - -7 - -threadIdx_x // 7 - -4 - -threadIdx_x // 7 + 4 - -9 - -(threadIdx_x // 7 + 4) % 9 - -7 - -(threadIdx_x // 7 + 4) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2548) 
// 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2548) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2548) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 2548) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 2548) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2548) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -2548 - -threadIdx_x + 2548 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 2548] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2548) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 2548] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2548) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], 
T.float32(0.0)) - -42 - -threadIdx_x < 42 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -2597 - -threadIdx_x + 2597 - -63 - -(threadIdx_x + 2597) // 63 - -49 - -(threadIdx_x + 2597) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 2597) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 2597) // 63 * 49 + threadIdx_x - -rc_outer_outer * 3136 + (threadIdx_x + 2597) // 63 * 49 + threadIdx_x + rx_outer_outer - -6 - -rc_outer_outer * 3136 + (threadIdx_x + 2597) // 63 * 49 + threadIdx_x + rx_outer_outer + 6 - -data[rc_outer_outer * 3136 + (threadIdx_x + 2597) // 63 * 49 + threadIdx_x + rx_outer_outer + 6] - -T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2597) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - -2597 - -threadIdx_x + 2597 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 2597] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2597) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 2597] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and 
rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2597) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - -7 - -7 <= threadIdx_x - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -rc_outer_outer * 3136 + threadIdx_x - -rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer - -2050 - -rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2050 - -data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2050] - -T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2050], T.float32(0.0)) - -2646 - -threadIdx_x + 2646 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 2646] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2050], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 2646] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2050], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -7 - -threadIdx_x // 7 + 7 - -9 - -(threadIdx_x // 7 + 7) % 9 - -1 <= 
(threadIdx_x // 7 + 7) % 9 - -7 - -threadIdx_x // 7 - -7 - -threadIdx_x // 7 + 7 - -9 - -(threadIdx_x // 7 + 7) % 9 - -8 - -(threadIdx_x // 7 + 7) % 9 < 8 - -1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -2695 - -threadIdx_x + 2695 - -63 - -(threadIdx_x + 2695) // 63 - -49 - -(threadIdx_x + 2695) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 2695) // 63 * 49 - -7 - -threadIdx_x // 7 - -7 - -threadIdx_x // 7 + 7 - -9 - -(threadIdx_x // 7 + 7) % 9 - -7 - -(threadIdx_x // 7 + 7) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2695) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2695) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2695) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 2695) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 2695) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2695) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -2695 - -threadIdx_x + 
2695 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 2695] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2695) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 2695] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2695) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -5 - -threadIdx_x // 7 + 5 - -9 - -(threadIdx_x // 7 + 5) % 9 - -1 <= (threadIdx_x // 7 + 5) % 9 - -7 - -threadIdx_x // 7 - -5 - -threadIdx_x // 7 + 5 - -9 - -(threadIdx_x // 7 + 5) % 9 - -8 - -(threadIdx_x // 7 + 5) % 9 < 8 - -1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -2744 - -threadIdx_x + 2744 - -63 - -(threadIdx_x + 2744) // 63 - -49 - 
-(threadIdx_x + 2744) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 2744) // 63 * 49 - -7 - -threadIdx_x // 7 - -5 - -threadIdx_x // 7 + 5 - -9 - -(threadIdx_x // 7 + 5) % 9 - -7 - -(threadIdx_x // 7 + 5) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2744) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2744) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2744) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 2744) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 2744) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2744) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -2744 - -threadIdx_x + 2744 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 2744] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2744) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 2744] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 
and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2744) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -3 - -threadIdx_x // 7 + 3 - -9 - -(threadIdx_x // 7 + 3) % 9 - -1 <= (threadIdx_x // 7 + 3) % 9 - -7 - -threadIdx_x // 7 - -3 - -threadIdx_x // 7 + 3 - -9 - -(threadIdx_x // 7 + 3) % 9 - -8 - -(threadIdx_x // 7 + 3) % 9 < 8 - -1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -2793 - -threadIdx_x + 2793 - -63 - -(threadIdx_x + 2793) // 63 - -49 - -(threadIdx_x + 2793) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 2793) // 63 * 49 - -7 - -threadIdx_x // 7 - -3 - -threadIdx_x // 7 + 3 - -9 - -(threadIdx_x // 7 + 3) % 9 - -7 - -(threadIdx_x // 7 + 3) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2793) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2793) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2793) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 2793) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 2793) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + 
rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2793) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -2793 - -threadIdx_x + 2793 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 2793] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2793) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 2793] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2793) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -2842 - -threadIdx_x + 2842 - -63 - -(threadIdx_x + 2842) // 63 - -49 - -(threadIdx_x + 2842) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 2842) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 
2842) // 63 * 49 + threadIdx_x - -rc_outer_outer * 3136 + (threadIdx_x + 2842) // 63 * 49 + threadIdx_x + rx_outer_outer - -1 - -rc_outer_outer * 3136 + (threadIdx_x + 2842) // 63 * 49 + threadIdx_x + rx_outer_outer - 1 - -data[rc_outer_outer * 3136 + (threadIdx_x + 2842) // 63 * 49 + threadIdx_x + rx_outer_outer - 1] - -T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2842) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - -2842 - -threadIdx_x + 2842 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -rx_outer_outer = T.int32() -threadIdx_x = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 2842] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2842) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 2842] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2842) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -8 - -threadIdx_x // 7 + 8 - -9 - -(threadIdx_x // 7 + 8) % 9 - -1 <= (threadIdx_x // 7 + 8) % 9 - -7 - -threadIdx_x // 7 - -8 - -threadIdx_x // 7 + 8 - -9 - -(threadIdx_x // 7 + 8) % 9 - -8 - -(threadIdx_x // 7 + 8) % 9 < 8 - -1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - 
-rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -2891 - -threadIdx_x + 2891 - -63 - -(threadIdx_x + 2891) // 63 - -49 - -(threadIdx_x + 2891) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 2891) // 63 * 49 - -7 - -threadIdx_x // 7 - -8 - -threadIdx_x // 7 + 8 - -9 - -(threadIdx_x // 7 + 8) % 9 - -7 - -(threadIdx_x // 7 + 8) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2891) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2891) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2891) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 2891) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 2891) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2891) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -2891 - -threadIdx_x + 2891 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 2891] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2891) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + 
rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 2891] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2891) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -6 - -threadIdx_x // 7 + 6 - -9 - -(threadIdx_x // 7 + 6) % 9 - -1 <= (threadIdx_x // 7 + 6) % 9 - -7 - -threadIdx_x // 7 - -6 - -threadIdx_x // 7 + 6 - -9 - -(threadIdx_x // 7 + 6) % 9 - -8 - -(threadIdx_x // 7 + 6) % 9 < 8 - -1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -2940 - -threadIdx_x + 2940 - -63 - -(threadIdx_x + 2940) // 63 - -49 - -(threadIdx_x + 2940) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 2940) // 63 * 49 - -7 - -threadIdx_x // 7 - -6 - -threadIdx_x // 7 + 6 - -9 - -(threadIdx_x // 7 + 6) % 9 - -7 - -(threadIdx_x // 7 + 6) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2940) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2940) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + 
(threadIdx_x + 2940) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 2940) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 2940) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2940) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -2940 - -threadIdx_x + 2940 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 2940] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2940) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 2940] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2940) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -4 - -threadIdx_x // 7 + 4 - -9 - -(threadIdx_x // 7 + 4) % 9 - -1 <= (threadIdx_x // 7 + 4) % 9 - -7 - -threadIdx_x // 7 - -4 - -threadIdx_x // 7 + 4 - -9 - 
-(threadIdx_x // 7 + 4) % 9 - -8 - -(threadIdx_x // 7 + 4) % 9 < 8 - -1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -2989 - -threadIdx_x + 2989 - -63 - -(threadIdx_x + 2989) // 63 - -49 - -(threadIdx_x + 2989) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 2989) // 63 * 49 - -7 - -threadIdx_x // 7 - -4 - -threadIdx_x // 7 + 4 - -9 - -(threadIdx_x // 7 + 4) % 9 - -7 - -(threadIdx_x // 7 + 4) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2989) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2989) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 2989) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 2989) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 2989) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2989) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -2989 - -threadIdx_x + 2989 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() 
-rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 2989] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2989) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 2989] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2989) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -42 - -threadIdx_x < 42 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -3038 - -threadIdx_x + 3038 - -63 - -(threadIdx_x + 3038) // 63 - -49 - -(threadIdx_x + 3038) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 3038) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 3038) // 63 * 49 + threadIdx_x - -rc_outer_outer * 3136 + (threadIdx_x + 3038) // 63 * 49 + threadIdx_x + rx_outer_outer - -6 - -rc_outer_outer * 3136 + (threadIdx_x + 3038) // 63 * 49 + threadIdx_x + rx_outer_outer + 6 - -data[rc_outer_outer * 3136 + (threadIdx_x + 3038) // 63 * 49 + threadIdx_x + rx_outer_outer + 6] - -T.if_then_else(threadIdx_x < 42 and 1 
<= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3038) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - -3038 - -threadIdx_x + 3038 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 3038] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3038) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 3038] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3038) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - -7 - -7 <= threadIdx_x - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -rc_outer_outer * 3136 + threadIdx_x - -rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer - -2393 - -rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2393 - -data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2393] - -T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2393], T.float32(0.0)) - 
-3087 - -threadIdx_x + 3087 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 3087] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2393], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 3087] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2393], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -7 - -threadIdx_x // 7 + 7 - -9 - -(threadIdx_x // 7 + 7) % 9 - -1 <= (threadIdx_x // 7 + 7) % 9 - -7 - -threadIdx_x // 7 - -7 - -threadIdx_x // 7 + 7 - -9 - -(threadIdx_x // 7 + 7) % 9 - -8 - -(threadIdx_x // 7 + 7) % 9 < 8 - -1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -3136 - -threadIdx_x + 3136 - -63 - -(threadIdx_x + 3136) // 63 - -49 - -(threadIdx_x + 3136) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 3136) // 63 * 49 - -7 - -threadIdx_x // 7 - -7 - -threadIdx_x // 7 + 7 - -9 - -(threadIdx_x // 7 + 7) % 9 - -7 - -(threadIdx_x // 7 + 7) % 9 * 7 - 
-rc_outer_outer * 3136 + (threadIdx_x + 3136) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 3136) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 3136) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 3136) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 3136) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3136) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -3136 - -threadIdx_x + 3136 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 3136] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3136) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 3136] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3136) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + 
rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -5 - -threadIdx_x // 7 + 5 - -9 - -(threadIdx_x // 7 + 5) % 9 - -1 <= (threadIdx_x // 7 + 5) % 9 - -7 - -threadIdx_x // 7 - -5 - -threadIdx_x // 7 + 5 - -9 - -(threadIdx_x // 7 + 5) % 9 - -8 - -(threadIdx_x // 7 + 5) % 9 < 8 - -1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -3185 - -threadIdx_x + 3185 - -63 - -(threadIdx_x + 3185) // 63 - -49 - -(threadIdx_x + 3185) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 3185) // 63 * 49 - -7 - -threadIdx_x // 7 - -5 - -threadIdx_x // 7 + 5 - -9 - -(threadIdx_x // 7 + 5) % 9 - -7 - -(threadIdx_x // 7 + 5) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 3185) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 3185) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 3185) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 3185) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 3185) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer 
* 3136 + (threadIdx_x + 3185) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -3185 - -threadIdx_x + 3185 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 3185] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3185) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 3185] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3185) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -3 - -threadIdx_x // 7 + 3 - -9 - -(threadIdx_x // 7 + 3) % 9 - -1 <= (threadIdx_x // 7 + 3) % 9 - -7 - -threadIdx_x // 7 - -3 - -threadIdx_x // 7 + 3 - -9 - -(threadIdx_x // 7 + 3) % 9 - -8 - -(threadIdx_x // 7 + 3) % 9 < 8 - -1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 
and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -3234 - -threadIdx_x + 3234 - -63 - -(threadIdx_x + 3234) // 63 - -49 - -(threadIdx_x + 3234) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 3234) // 63 * 49 - -7 - -threadIdx_x // 7 - -3 - -threadIdx_x // 7 + 3 - -9 - -(threadIdx_x // 7 + 3) % 9 - -7 - -(threadIdx_x // 7 + 3) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 3234) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 3234) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 3234) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 3234) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 3234) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3234) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -3234 - -threadIdx_x + 3234 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 3234] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3234) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = 
T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 3234] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3234) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -3283 - -threadIdx_x + 3283 - -63 - -(threadIdx_x + 3283) // 63 - -49 - -(threadIdx_x + 3283) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 3283) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 3283) // 63 * 49 + threadIdx_x - -rc_outer_outer * 3136 + (threadIdx_x + 3283) // 63 * 49 + threadIdx_x + rx_outer_outer - -1 - -rc_outer_outer * 3136 + (threadIdx_x + 3283) // 63 * 49 + threadIdx_x + rx_outer_outer - 1 - -data[rc_outer_outer * 3136 + (threadIdx_x + 3283) // 63 * 49 + threadIdx_x + rx_outer_outer - 1] - -T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3283) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - -3283 - -threadIdx_x + 3283 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -rx_outer_outer = T.int32() -threadIdx_x = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 3283] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3283) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - 
pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 3283] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3283) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -8 - -threadIdx_x // 7 + 8 - -9 - -(threadIdx_x // 7 + 8) % 9 - -1 <= (threadIdx_x // 7 + 8) % 9 - -7 - -threadIdx_x // 7 - -8 - -threadIdx_x // 7 + 8 - -9 - -(threadIdx_x // 7 + 8) % 9 - -8 - -(threadIdx_x // 7 + 8) % 9 < 8 - -1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -3332 - -threadIdx_x + 3332 - -63 - -(threadIdx_x + 3332) // 63 - -49 - -(threadIdx_x + 3332) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 3332) // 63 * 49 - -7 - -threadIdx_x // 7 - -8 - -threadIdx_x // 7 + 8 - -9 - -(threadIdx_x // 7 + 8) % 9 - -7 - -(threadIdx_x // 7 + 8) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 3332) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 3332) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 3332) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 3332) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 
7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 3332) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3332) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -3332 - -threadIdx_x + 3332 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 3332] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3332) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 3332] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3332) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -6 - -threadIdx_x // 7 + 6 - -9 - -(threadIdx_x // 7 + 6) % 9 - -1 <= (threadIdx_x // 7 + 6) % 9 - -7 - -threadIdx_x // 7 - -6 - -threadIdx_x // 7 + 6 - -9 - -(threadIdx_x // 7 + 6) % 9 - -8 - -(threadIdx_x // 7 + 6) % 9 < 8 - -1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 
7 - -1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -3381 - -threadIdx_x + 3381 - -63 - -(threadIdx_x + 3381) // 63 - -49 - -(threadIdx_x + 3381) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 3381) // 63 * 49 - -7 - -threadIdx_x // 7 - -6 - -threadIdx_x // 7 + 6 - -9 - -(threadIdx_x // 7 + 6) % 9 - -7 - -(threadIdx_x // 7 + 6) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 3381) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 3381) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 3381) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 3381) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 3381) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3381) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -3381 - -threadIdx_x + 3381 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 3381] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 
and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3381) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 3381] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3381) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -4 - -threadIdx_x // 7 + 4 - -9 - -(threadIdx_x // 7 + 4) % 9 - -1 <= (threadIdx_x // 7 + 4) % 9 - -7 - -threadIdx_x // 7 - -4 - -threadIdx_x // 7 + 4 - -9 - -(threadIdx_x // 7 + 4) % 9 - -8 - -(threadIdx_x // 7 + 4) % 9 < 8 - -1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -3430 - -threadIdx_x + 3430 - -63 - -(threadIdx_x + 3430) // 63 - -49 - -(threadIdx_x + 3430) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 3430) // 63 * 49 - -7 - -threadIdx_x // 7 - -4 - -threadIdx_x // 7 + 4 - -9 - -(threadIdx_x // 7 + 4) % 9 - -7 - -(threadIdx_x // 7 + 4) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 3430) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 - -rc_outer_outer * 3136 + 
(threadIdx_x + 3430) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 3430) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 3430) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 3430) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3430) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -3430 - -threadIdx_x + 3430 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 3430] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3430) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 3430] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3430) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -42 - -threadIdx_x < 42 - -1 - -7 - -threadIdx_x % 7 - 
-rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -3479 - -threadIdx_x + 3479 - -63 - -(threadIdx_x + 3479) // 63 - -49 - -(threadIdx_x + 3479) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 3479) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 3479) // 63 * 49 + threadIdx_x - -rc_outer_outer * 3136 + (threadIdx_x + 3479) // 63 * 49 + threadIdx_x + rx_outer_outer - -6 - -rc_outer_outer * 3136 + (threadIdx_x + 3479) // 63 * 49 + threadIdx_x + rx_outer_outer + 6 - -data[rc_outer_outer * 3136 + (threadIdx_x + 3479) // 63 * 49 + threadIdx_x + rx_outer_outer + 6] - -T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3479) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - -3479 - -threadIdx_x + 3479 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 3479] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3479) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 3479] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 
3479) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - -7 - -7 <= threadIdx_x - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -rc_outer_outer * 3136 + threadIdx_x - -rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer - -2736 - -rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2736 - -data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2736] - -T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2736], T.float32(0.0)) - -3528 - -threadIdx_x + 3528 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 3528] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2736], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 3528] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2736], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -7 - -threadIdx_x // 7 + 7 - -9 - -(threadIdx_x // 7 + 7) % 9 - -1 <= (threadIdx_x // 7 + 7) % 9 - -7 - -threadIdx_x // 7 - -7 - -threadIdx_x // 7 + 7 - -9 - 
-(threadIdx_x // 7 + 7) % 9 - -8 - -(threadIdx_x // 7 + 7) % 9 < 8 - -1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -3577 - -threadIdx_x + 3577 - -63 - -(threadIdx_x + 3577) // 63 - -49 - -(threadIdx_x + 3577) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 3577) // 63 * 49 - -7 - -threadIdx_x // 7 - -7 - -threadIdx_x // 7 + 7 - -9 - -(threadIdx_x // 7 + 7) % 9 - -7 - -(threadIdx_x // 7 + 7) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 3577) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 3577) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 3577) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 3577) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 3577) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3577) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -3577 - -threadIdx_x + 3577 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() 
-rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 3577] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3577) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 3577] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3577) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -5 - -threadIdx_x // 7 + 5 - -9 - -(threadIdx_x // 7 + 5) % 9 - -1 <= (threadIdx_x // 7 + 5) % 9 - -7 - -threadIdx_x // 7 - -5 - -threadIdx_x // 7 + 5 - -9 - -(threadIdx_x // 7 + 5) % 9 - -8 - -(threadIdx_x // 7 + 5) % 9 < 8 - -1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -3626 - -threadIdx_x + 3626 - -63 - -(threadIdx_x + 3626) // 63 - -49 - -(threadIdx_x + 3626) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 3626) // 63 * 49 - 
-7 - -threadIdx_x // 7 - -5 - -threadIdx_x // 7 + 5 - -9 - -(threadIdx_x // 7 + 5) % 9 - -7 - -(threadIdx_x // 7 + 5) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 3626) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 3626) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 3626) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 3626) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 3626) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3626) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -3626 - -threadIdx_x + 3626 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 3626] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3626) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 3626] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and 
rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3626) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -3 - -threadIdx_x // 7 + 3 - -9 - -(threadIdx_x // 7 + 3) % 9 - -1 <= (threadIdx_x // 7 + 3) % 9 - -7 - -threadIdx_x // 7 - -3 - -threadIdx_x // 7 + 3 - -9 - -(threadIdx_x // 7 + 3) % 9 - -8 - -(threadIdx_x // 7 + 3) % 9 < 8 - -1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -3675 - -threadIdx_x + 3675 - -63 - -(threadIdx_x + 3675) // 63 - -49 - -(threadIdx_x + 3675) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 3675) // 63 * 49 - -7 - -threadIdx_x // 7 - -3 - -threadIdx_x // 7 + 3 - -9 - -(threadIdx_x // 7 + 3) % 9 - -7 - -(threadIdx_x // 7 + 3) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 3675) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 3675) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 3675) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 3675) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 3675) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and 
(threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3675) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -3675 - -threadIdx_x + 3675 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 3675] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3675) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 3675] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3675) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -3724 - -threadIdx_x + 3724 - -63 - -(threadIdx_x + 3724) // 63 - -49 - -(threadIdx_x + 3724) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 3724) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 3724) // 63 * 49 + threadIdx_x - -rc_outer_outer * 3136 + (threadIdx_x + 3724) // 63 * 49 + 
threadIdx_x + rx_outer_outer - -1 - -rc_outer_outer * 3136 + (threadIdx_x + 3724) // 63 * 49 + threadIdx_x + rx_outer_outer - 1 - -data[rc_outer_outer * 3136 + (threadIdx_x + 3724) // 63 * 49 + threadIdx_x + rx_outer_outer - 1] - -T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3724) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - -3724 - -threadIdx_x + 3724 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -rx_outer_outer = T.int32() -threadIdx_x = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 3724] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3724) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 3724] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3724) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -8 - -threadIdx_x // 7 + 8 - -9 - -(threadIdx_x // 7 + 8) % 9 - -1 <= (threadIdx_x // 7 + 8) % 9 - -7 - -threadIdx_x // 7 - -8 - -threadIdx_x // 7 + 8 - -9 - -(threadIdx_x // 7 + 8) % 9 - -8 - -(threadIdx_x // 7 + 8) % 9 < 8 - -1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x 
// 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -3773 - -threadIdx_x + 3773 - -63 - -(threadIdx_x + 3773) // 63 - -49 - -(threadIdx_x + 3773) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 3773) // 63 * 49 - -7 - -threadIdx_x // 7 - -8 - -threadIdx_x // 7 + 8 - -9 - -(threadIdx_x // 7 + 8) % 9 - -7 - -(threadIdx_x // 7 + 8) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 3773) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 3773) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 3773) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 3773) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 3773) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3773) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -3773 - -threadIdx_x + 3773 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 3773] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3773) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as 
threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 3773] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3773) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -6 - -threadIdx_x // 7 + 6 - -9 - -(threadIdx_x // 7 + 6) % 9 - -1 <= (threadIdx_x // 7 + 6) % 9 - -7 - -threadIdx_x // 7 - -6 - -threadIdx_x // 7 + 6 - -9 - -(threadIdx_x // 7 + 6) % 9 - -8 - -(threadIdx_x // 7 + 6) % 9 < 8 - -1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -3822 - -threadIdx_x + 3822 - -63 - -(threadIdx_x + 3822) // 63 - -49 - -(threadIdx_x + 3822) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 3822) // 63 * 49 - -7 - -threadIdx_x // 7 - -6 - -threadIdx_x // 7 + 6 - -9 - -(threadIdx_x // 7 + 6) % 9 - -7 - -(threadIdx_x // 7 + 6) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 3822) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 3822) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 3822) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - 
-rc_outer_outer * 3136 + (threadIdx_x + 3822) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 3822) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3822) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -3822 - -threadIdx_x + 3822 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 3822] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3822) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 3822] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3822) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -1 - -7 - -threadIdx_x // 7 - -4 - -threadIdx_x // 7 + 4 - -9 - -(threadIdx_x // 7 + 4) % 9 - -1 <= (threadIdx_x // 7 + 4) % 9 - -7 - -threadIdx_x // 7 - -4 - -threadIdx_x // 7 + 4 - -9 - -(threadIdx_x // 7 + 4) % 9 - -8 - -(threadIdx_x // 7 + 4) % 9 < 8 - -1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x 
// 7 + 4) % 9 < 8 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -3871 - -threadIdx_x + 3871 - -63 - -(threadIdx_x + 3871) // 63 - -49 - -(threadIdx_x + 3871) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 3871) // 63 * 49 - -7 - -threadIdx_x // 7 - -4 - -threadIdx_x // 7 + 4 - -9 - -(threadIdx_x // 7 + 4) % 9 - -7 - -(threadIdx_x // 7 + 4) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 3871) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 - -rc_outer_outer * 3136 + (threadIdx_x + 3871) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer - -7 - -threadIdx_x % 7 - -rc_outer_outer * 3136 + (threadIdx_x + 3871) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - -8 - -rc_outer_outer * 3136 + (threadIdx_x + 3871) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8 - -data[rc_outer_outer * 3136 + (threadIdx_x + 3871) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8] - -T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3871) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -3871 - -threadIdx_x + 3871 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 3871] = 
T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3871) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 3871] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3871) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - -42 - -threadIdx_x < 42 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -3920 - -threadIdx_x + 3920 - -63 - -(threadIdx_x + 3920) // 63 - -49 - -(threadIdx_x + 3920) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 3920) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 3920) // 63 * 49 + threadIdx_x - -rc_outer_outer * 3136 + (threadIdx_x + 3920) // 63 * 49 + threadIdx_x + rx_outer_outer - -6 - -rc_outer_outer * 3136 + (threadIdx_x + 3920) // 63 * 49 + threadIdx_x + rx_outer_outer + 6 - -data[rc_outer_outer * 3136 + (threadIdx_x + 3920) // 63 * 49 + threadIdx_x + rx_outer_outer + 6] - -T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 
3920) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - -3920 - -threadIdx_x + 3920 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 3920] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3920) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 3920] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3920) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - -7 - -7 <= threadIdx_x - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -rc_outer_outer * 3136 + threadIdx_x - -rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer - -3079 - -rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 3079 - -data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 3079] - -T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 3079], T.float32(0.0)) - -3969 - -threadIdx_x + 3969 - -pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = 
T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 3969] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 3079], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 3969] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 3079], T.float32(0.0)) - -14 - -threadIdx_x < 14 - -7 - -threadIdx_x < 7 - -1 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -1 <= rx_outer_outer + threadIdx_x % 7 - -threadIdx_x < 7 and 1 <= rx_outer_outer + threadIdx_x % 7 - -7 - -threadIdx_x % 7 - -rx_outer_outer + threadIdx_x % 7 - -8 - -rx_outer_outer + threadIdx_x % 7 < 8 - -threadIdx_x < 7 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8 - -3136 - -rc_outer_outer * 3136 - -4018 - -threadIdx_x + 4018 - -63 - -(threadIdx_x + 4018) // 63 - -49 - -(threadIdx_x + 4018) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 4018) // 63 * 49 - -rc_outer_outer * 3136 + (threadIdx_x + 4018) // 63 * 49 + rx_outer_outer - -rc_outer_outer * 3136 + (threadIdx_x + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x - -41 - -rc_outer_outer * 3136 + (threadIdx_x + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x + 41 - -data[rc_outer_outer * 3136 + (threadIdx_x + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x + 41] - -T.if_then_else(threadIdx_x < 7 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x + 41], T.float32(0.0)) - -4018 - -threadIdx_x + 4018 - 
-pad_temp_shared = T.Buffer((4032,), scope="shared") -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -pad_temp_shared[threadIdx_x + 4018] = T.if_then_else(threadIdx_x < 7 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x + 41], T.float32(0.0)) - -threadIdx_x = T.int32() -if threadIdx_x < 14: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 4018] = T.if_then_else(threadIdx_x < 7 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x + 41], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if threadIdx_x < 14: - pad_temp_shared = T.Buffer((4032,), scope="shared") - rx_outer_outer = T.int32() - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - pad_temp_shared[threadIdx_x + 4018] = T.if_then_else(threadIdx_x < 7 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x + 41], T.float32(0.0)) - -49 - -blockIdx_x - -36864 - -blockIdx_x * 36864 - -576 - -rc_outer_outer * 576 - -blockIdx_x * 36864 + rc_outer_outer * 576 - -threadIdx_x - -3 - -threadIdx_x * 3 - -blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x * 3 - -blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer] - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -rc_outer_outer = T.int32() -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -kernel_shared[threadIdx_x] = 
kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[threadIdx_x] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer] - -36864 - -blockIdx_x * 36864 - -576 - -rc_outer_outer * 576 - -blockIdx_x * 36864 + rc_outer_outer * 576 - -3 - -threadIdx_x * 3 - -blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x * 3 - -blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer - -147 - -blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 147 - -kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 147] - -49 - -threadIdx_x + 49 - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -rc_outer_outer = T.int32() -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -kernel_shared[threadIdx_x + 49] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 147] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[threadIdx_x + 49] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 147] - -36864 - -blockIdx_x * 36864 - -576 - -rc_outer_outer * 576 - -blockIdx_x * 36864 + rc_outer_outer * 576 - -3 - -threadIdx_x * 3 - -blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x * 3 - -blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer - -294 - -blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 294 - -kernel[blockIdx_x * 36864 + rc_outer_outer * 
576 + threadIdx_x * 3 + rx_outer_outer + 294] - -98 - -threadIdx_x + 98 - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -rc_outer_outer = T.int32() -threadIdx_x = T.int32() -rx_outer_outer = T.int32() -kernel_shared[threadIdx_x + 98] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 294] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[threadIdx_x + 98] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 294] - -36864 - -blockIdx_x * 36864 - -147 - -threadIdx_x + 147 - -192 - -(threadIdx_x + 147) // 192 - -4608 - -(threadIdx_x + 147) // 192 * 4608 - -blockIdx_x * 36864 + (threadIdx_x + 147) // 192 * 4608 - -576 - -rc_outer_outer * 576 - -blockIdx_x * 36864 + (threadIdx_x + 147) // 192 * 4608 + rc_outer_outer * 576 - -147 - -threadIdx_x + 147 - -192 - -(threadIdx_x + 147) % 192 - -3 - -(threadIdx_x + 147) % 192 * 3 - -blockIdx_x * 36864 + (threadIdx_x + 147) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 147) % 192 * 3 - -blockIdx_x * 36864 + (threadIdx_x + 147) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 147) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (threadIdx_x + 147) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 147) % 192 * 3 + rx_outer_outer] - -147 - -threadIdx_x + 147 - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[threadIdx_x + 147] = kernel[blockIdx_x * 36864 + (threadIdx_x + 147) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 147) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - kernel_shared = 
T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[threadIdx_x + 147] = kernel[blockIdx_x * 36864 + (threadIdx_x + 147) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 147) % 192 * 3 + rx_outer_outer] - -36864 - -blockIdx_x * 36864 - -196 - -threadIdx_x + 196 - -192 - -(threadIdx_x + 196) // 192 - -4608 - -(threadIdx_x + 196) // 192 * 4608 - -blockIdx_x * 36864 + (threadIdx_x + 196) // 192 * 4608 - -576 - -rc_outer_outer * 576 - -blockIdx_x * 36864 + (threadIdx_x + 196) // 192 * 4608 + rc_outer_outer * 576 - -3 - -threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 196) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 196) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer - -12 - -blockIdx_x * 36864 + (threadIdx_x + 196) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 12 - -kernel[blockIdx_x * 36864 + (threadIdx_x + 196) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 12] - -196 - -threadIdx_x + 196 - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[threadIdx_x + 196] = kernel[blockIdx_x * 36864 + (threadIdx_x + 196) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 12] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[threadIdx_x + 196] = kernel[blockIdx_x * 36864 + (threadIdx_x + 196) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 12] - -36864 - -blockIdx_x * 36864 - -245 - -threadIdx_x + 245 - -192 - -(threadIdx_x + 245) // 192 - -4608 
- -(threadIdx_x + 245) // 192 * 4608 - -blockIdx_x * 36864 + (threadIdx_x + 245) // 192 * 4608 - -576 - -rc_outer_outer * 576 - -blockIdx_x * 36864 + (threadIdx_x + 245) // 192 * 4608 + rc_outer_outer * 576 - -3 - -threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 245) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 245) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer - -159 - -blockIdx_x * 36864 + (threadIdx_x + 245) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 159 - -kernel[blockIdx_x * 36864 + (threadIdx_x + 245) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 159] - -245 - -threadIdx_x + 245 - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[threadIdx_x + 245] = kernel[blockIdx_x * 36864 + (threadIdx_x + 245) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 159] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[threadIdx_x + 245] = kernel[blockIdx_x * 36864 + (threadIdx_x + 245) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 159] - -36864 - -blockIdx_x * 36864 - -294 - -threadIdx_x + 294 - -192 - -(threadIdx_x + 294) // 192 - -4608 - -(threadIdx_x + 294) // 192 * 4608 - -blockIdx_x * 36864 + (threadIdx_x + 294) // 192 * 4608 - -576 - -rc_outer_outer * 576 - -blockIdx_x * 36864 + (threadIdx_x + 294) // 192 * 4608 + rc_outer_outer * 576 - -3 - -threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 294) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 294) // 192 * 4608 + rc_outer_outer * 576 + 
threadIdx_x * 3 + rx_outer_outer - -306 - -blockIdx_x * 36864 + (threadIdx_x + 294) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 306 - -kernel[blockIdx_x * 36864 + (threadIdx_x + 294) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 306] - -294 - -threadIdx_x + 294 - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[threadIdx_x + 294] = kernel[blockIdx_x * 36864 + (threadIdx_x + 294) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 306] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[threadIdx_x + 294] = kernel[blockIdx_x * 36864 + (threadIdx_x + 294) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 306] - -36864 - -blockIdx_x * 36864 - -343 - -threadIdx_x + 343 - -192 - -(threadIdx_x + 343) // 192 - -4608 - -(threadIdx_x + 343) // 192 * 4608 - -blockIdx_x * 36864 + (threadIdx_x + 343) // 192 * 4608 - -576 - -rc_outer_outer * 576 - -blockIdx_x * 36864 + (threadIdx_x + 343) // 192 * 4608 + rc_outer_outer * 576 - -151 - -threadIdx_x + 151 - -192 - -(threadIdx_x + 151) % 192 - -3 - -(threadIdx_x + 151) % 192 * 3 - -blockIdx_x * 36864 + (threadIdx_x + 343) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 151) % 192 * 3 - -blockIdx_x * 36864 + (threadIdx_x + 343) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 151) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (threadIdx_x + 343) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 151) % 192 * 3 + rx_outer_outer] - -343 - -threadIdx_x + 343 - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() 
-threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[threadIdx_x + 343] = kernel[blockIdx_x * 36864 + (threadIdx_x + 343) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 151) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[threadIdx_x + 343] = kernel[blockIdx_x * 36864 + (threadIdx_x + 343) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 151) % 192 * 3 + rx_outer_outer] - -36864 - -blockIdx_x * 36864 - -392 - -threadIdx_x + 392 - -192 - -(threadIdx_x + 392) // 192 - -4608 - -(threadIdx_x + 392) // 192 * 4608 - -blockIdx_x * 36864 + (threadIdx_x + 392) // 192 * 4608 - -576 - -rc_outer_outer * 576 - -blockIdx_x * 36864 + (threadIdx_x + 392) // 192 * 4608 + rc_outer_outer * 576 - -3 - -threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 392) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 392) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer - -24 - -blockIdx_x * 36864 + (threadIdx_x + 392) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 24 - -kernel[blockIdx_x * 36864 + (threadIdx_x + 392) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 24] - -392 - -threadIdx_x + 392 - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[threadIdx_x + 392] = kernel[blockIdx_x * 36864 + (threadIdx_x + 392) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 24] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = 
T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[threadIdx_x + 392] = kernel[blockIdx_x * 36864 + (threadIdx_x + 392) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 24] - -36864 - -blockIdx_x * 36864 - -441 - -threadIdx_x + 441 - -192 - -(threadIdx_x + 441) // 192 - -4608 - -(threadIdx_x + 441) // 192 * 4608 - -blockIdx_x * 36864 + (threadIdx_x + 441) // 192 * 4608 - -576 - -rc_outer_outer * 576 - -blockIdx_x * 36864 + (threadIdx_x + 441) // 192 * 4608 + rc_outer_outer * 576 - -3 - -threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 441) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 441) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer - -171 - -blockIdx_x * 36864 + (threadIdx_x + 441) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 171 - -kernel[blockIdx_x * 36864 + (threadIdx_x + 441) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 171] - -441 - -threadIdx_x + 441 - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[threadIdx_x + 441] = kernel[blockIdx_x * 36864 + (threadIdx_x + 441) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 171] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[threadIdx_x + 441] = kernel[blockIdx_x * 36864 + (threadIdx_x + 441) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 171] - -36864 - -blockIdx_x * 36864 - -490 - -threadIdx_x + 490 - -192 - -(threadIdx_x + 490) // 192 - -4608 - -(threadIdx_x + 490) // 192 * 4608 - -blockIdx_x * 36864 + (threadIdx_x + 490) // 
192 * 4608 - -576 - -rc_outer_outer * 576 - -blockIdx_x * 36864 + (threadIdx_x + 490) // 192 * 4608 + rc_outer_outer * 576 - -3 - -threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 490) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 490) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer - -318 - -blockIdx_x * 36864 + (threadIdx_x + 490) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 318 - -kernel[blockIdx_x * 36864 + (threadIdx_x + 490) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 318] - -490 - -threadIdx_x + 490 - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[threadIdx_x + 490] = kernel[blockIdx_x * 36864 + (threadIdx_x + 490) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 318] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[threadIdx_x + 490] = kernel[blockIdx_x * 36864 + (threadIdx_x + 490) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 318] - -36864 - -blockIdx_x * 36864 - -539 - -threadIdx_x + 539 - -192 - -(threadIdx_x + 539) // 192 - -4608 - -(threadIdx_x + 539) // 192 * 4608 - -blockIdx_x * 36864 + (threadIdx_x + 539) // 192 * 4608 - -576 - -rc_outer_outer * 576 - -blockIdx_x * 36864 + (threadIdx_x + 539) // 192 * 4608 + rc_outer_outer * 576 - -155 - -threadIdx_x + 155 - -192 - -(threadIdx_x + 155) % 192 - -3 - -(threadIdx_x + 155) % 192 * 3 - -blockIdx_x * 36864 + (threadIdx_x + 539) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 155) % 192 * 3 - -blockIdx_x * 36864 + (threadIdx_x + 539) // 192 * 4608 + rc_outer_outer * 576 + 
(threadIdx_x + 155) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (threadIdx_x + 539) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 155) % 192 * 3 + rx_outer_outer] - -539 - -threadIdx_x + 539 - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[threadIdx_x + 539] = kernel[blockIdx_x * 36864 + (threadIdx_x + 539) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 155) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[threadIdx_x + 539] = kernel[blockIdx_x * 36864 + (threadIdx_x + 539) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 155) % 192 * 3 + rx_outer_outer] - -36864 - -blockIdx_x * 36864 - -588 - -threadIdx_x + 588 - -192 - -(threadIdx_x + 588) // 192 - -4608 - -(threadIdx_x + 588) // 192 * 4608 - -blockIdx_x * 36864 + (threadIdx_x + 588) // 192 * 4608 - -576 - -rc_outer_outer * 576 - -blockIdx_x * 36864 + (threadIdx_x + 588) // 192 * 4608 + rc_outer_outer * 576 - -3 - -threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 588) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 588) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer - -36 - -blockIdx_x * 36864 + (threadIdx_x + 588) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 36 - -kernel[blockIdx_x * 36864 + (threadIdx_x + 588) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 36] - -588 - -threadIdx_x + 588 - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() 
-kernel_shared[threadIdx_x + 588] = kernel[blockIdx_x * 36864 + (threadIdx_x + 588) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 36] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[threadIdx_x + 588] = kernel[blockIdx_x * 36864 + (threadIdx_x + 588) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 36] - -36864 - -blockIdx_x * 36864 - -637 - -threadIdx_x + 637 - -192 - -(threadIdx_x + 637) // 192 - -4608 - -(threadIdx_x + 637) // 192 * 4608 - -blockIdx_x * 36864 + (threadIdx_x + 637) // 192 * 4608 - -576 - -rc_outer_outer * 576 - -blockIdx_x * 36864 + (threadIdx_x + 637) // 192 * 4608 + rc_outer_outer * 576 - -3 - -threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 637) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 637) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer - -183 - -blockIdx_x * 36864 + (threadIdx_x + 637) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 183 - -kernel[blockIdx_x * 36864 + (threadIdx_x + 637) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 183] - -637 - -threadIdx_x + 637 - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[threadIdx_x + 637] = kernel[blockIdx_x * 36864 + (threadIdx_x + 637) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 183] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[threadIdx_x + 
637] = kernel[blockIdx_x * 36864 + (threadIdx_x + 637) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 183] - -36864 - -blockIdx_x * 36864 - -686 - -threadIdx_x + 686 - -192 - -(threadIdx_x + 686) // 192 - -4608 - -(threadIdx_x + 686) // 192 * 4608 - -blockIdx_x * 36864 + (threadIdx_x + 686) // 192 * 4608 - -576 - -rc_outer_outer * 576 - -blockIdx_x * 36864 + (threadIdx_x + 686) // 192 * 4608 + rc_outer_outer * 576 - -3 - -threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 686) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 686) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer - -330 - -blockIdx_x * 36864 + (threadIdx_x + 686) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 330 - -kernel[blockIdx_x * 36864 + (threadIdx_x + 686) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 330] - -686 - -threadIdx_x + 686 - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[threadIdx_x + 686] = kernel[blockIdx_x * 36864 + (threadIdx_x + 686) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 330] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[threadIdx_x + 686] = kernel[blockIdx_x * 36864 + (threadIdx_x + 686) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 330] - -36864 - -blockIdx_x * 36864 - -735 - -threadIdx_x + 735 - -192 - -(threadIdx_x + 735) // 192 - -4608 - -(threadIdx_x + 735) // 192 * 4608 - -blockIdx_x * 36864 + (threadIdx_x + 735) // 192 * 4608 - -576 - -rc_outer_outer * 576 - -blockIdx_x * 36864 + (threadIdx_x + 735) // 192 * 
4608 + rc_outer_outer * 576 - -159 - -threadIdx_x + 159 - -192 - -(threadIdx_x + 159) % 192 - -3 - -(threadIdx_x + 159) % 192 * 3 - -blockIdx_x * 36864 + (threadIdx_x + 735) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 159) % 192 * 3 - -blockIdx_x * 36864 + (threadIdx_x + 735) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 159) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (threadIdx_x + 735) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 159) % 192 * 3 + rx_outer_outer] - -735 - -threadIdx_x + 735 - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[threadIdx_x + 735] = kernel[blockIdx_x * 36864 + (threadIdx_x + 735) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 159) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[threadIdx_x + 735] = kernel[blockIdx_x * 36864 + (threadIdx_x + 735) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 159) % 192 * 3 + rx_outer_outer] - -36864 - -blockIdx_x * 36864 - -784 - -threadIdx_x + 784 - -192 - -(threadIdx_x + 784) // 192 - -4608 - -(threadIdx_x + 784) // 192 * 4608 - -blockIdx_x * 36864 + (threadIdx_x + 784) // 192 * 4608 - -576 - -rc_outer_outer * 576 - -blockIdx_x * 36864 + (threadIdx_x + 784) // 192 * 4608 + rc_outer_outer * 576 - -3 - -threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 784) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 784) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer - -48 - -blockIdx_x * 36864 + (threadIdx_x + 784) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 48 - -kernel[blockIdx_x * 
36864 + (threadIdx_x + 784) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 48] - -784 - -threadIdx_x + 784 - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[threadIdx_x + 784] = kernel[blockIdx_x * 36864 + (threadIdx_x + 784) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 48] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[threadIdx_x + 784] = kernel[blockIdx_x * 36864 + (threadIdx_x + 784) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 48] - -36864 - -blockIdx_x * 36864 - -833 - -threadIdx_x + 833 - -192 - -(threadIdx_x + 833) // 192 - -4608 - -(threadIdx_x + 833) // 192 * 4608 - -blockIdx_x * 36864 + (threadIdx_x + 833) // 192 * 4608 - -576 - -rc_outer_outer * 576 - -blockIdx_x * 36864 + (threadIdx_x + 833) // 192 * 4608 + rc_outer_outer * 576 - -3 - -threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 833) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 833) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer - -195 - -blockIdx_x * 36864 + (threadIdx_x + 833) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 195 - -kernel[blockIdx_x * 36864 + (threadIdx_x + 833) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 195] - -833 - -threadIdx_x + 833 - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[threadIdx_x + 833] = kernel[blockIdx_x * 36864 + (threadIdx_x + 833) // 192 * 4608 + 
rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 195] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[threadIdx_x + 833] = kernel[blockIdx_x * 36864 + (threadIdx_x + 833) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 195] - -36864 - -blockIdx_x * 36864 - -882 - -threadIdx_x + 882 - -192 - -(threadIdx_x + 882) // 192 - -4608 - -(threadIdx_x + 882) // 192 * 4608 - -blockIdx_x * 36864 + (threadIdx_x + 882) // 192 * 4608 - -576 - -rc_outer_outer * 576 - -blockIdx_x * 36864 + (threadIdx_x + 882) // 192 * 4608 + rc_outer_outer * 576 - -3 - -threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 882) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 882) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer - -342 - -blockIdx_x * 36864 + (threadIdx_x + 882) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 342 - -kernel[blockIdx_x * 36864 + (threadIdx_x + 882) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 342] - -882 - -threadIdx_x + 882 - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[threadIdx_x + 882] = kernel[blockIdx_x * 36864 + (threadIdx_x + 882) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 342] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[threadIdx_x + 882] = kernel[blockIdx_x * 36864 + (threadIdx_x + 882) // 192 * 4608 + rc_outer_outer * 576 + 
threadIdx_x * 3 + rx_outer_outer + 342] - -36864 - -blockIdx_x * 36864 - -931 - -threadIdx_x + 931 - -192 - -(threadIdx_x + 931) // 192 - -4608 - -(threadIdx_x + 931) // 192 * 4608 - -blockIdx_x * 36864 + (threadIdx_x + 931) // 192 * 4608 - -576 - -rc_outer_outer * 576 - -blockIdx_x * 36864 + (threadIdx_x + 931) // 192 * 4608 + rc_outer_outer * 576 - -163 - -threadIdx_x + 163 - -192 - -(threadIdx_x + 163) % 192 - -3 - -(threadIdx_x + 163) % 192 * 3 - -blockIdx_x * 36864 + (threadIdx_x + 931) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 163) % 192 * 3 - -blockIdx_x * 36864 + (threadIdx_x + 931) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 163) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (threadIdx_x + 931) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 163) % 192 * 3 + rx_outer_outer] - -931 - -threadIdx_x + 931 - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[threadIdx_x + 931] = kernel[blockIdx_x * 36864 + (threadIdx_x + 931) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 163) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[threadIdx_x + 931] = kernel[blockIdx_x * 36864 + (threadIdx_x + 931) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 163) % 192 * 3 + rx_outer_outer] - -36864 - -blockIdx_x * 36864 - -980 - -threadIdx_x + 980 - -192 - -(threadIdx_x + 980) // 192 - -4608 - -(threadIdx_x + 980) // 192 * 4608 - -blockIdx_x * 36864 + (threadIdx_x + 980) // 192 * 4608 - -576 - -rc_outer_outer * 576 - -blockIdx_x * 36864 + (threadIdx_x + 980) // 192 * 4608 + rc_outer_outer * 576 - -3 - -threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 980) 
// 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 980) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer - -60 - -blockIdx_x * 36864 + (threadIdx_x + 980) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 60 - -kernel[blockIdx_x * 36864 + (threadIdx_x + 980) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 60] - -980 - -threadIdx_x + 980 - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[threadIdx_x + 980] = kernel[blockIdx_x * 36864 + (threadIdx_x + 980) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 60] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[threadIdx_x + 980] = kernel[blockIdx_x * 36864 + (threadIdx_x + 980) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 60] - -36864 - -blockIdx_x * 36864 - -1029 - -threadIdx_x + 1029 - -192 - -(threadIdx_x + 1029) // 192 - -4608 - -(threadIdx_x + 1029) // 192 * 4608 - -blockIdx_x * 36864 + (threadIdx_x + 1029) // 192 * 4608 - -576 - -rc_outer_outer * 576 - -blockIdx_x * 36864 + (threadIdx_x + 1029) // 192 * 4608 + rc_outer_outer * 576 - -3 - -threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 1029) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 1029) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer - -207 - -blockIdx_x * 36864 + (threadIdx_x + 1029) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 207 - -kernel[blockIdx_x * 36864 + (threadIdx_x + 1029) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + 
rx_outer_outer + 207] - -1029 - -threadIdx_x + 1029 - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[threadIdx_x + 1029] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1029) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 207] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[threadIdx_x + 1029] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1029) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 207] - -36864 - -blockIdx_x * 36864 - -1078 - -threadIdx_x + 1078 - -192 - -(threadIdx_x + 1078) // 192 - -4608 - -(threadIdx_x + 1078) // 192 * 4608 - -blockIdx_x * 36864 + (threadIdx_x + 1078) // 192 * 4608 - -576 - -rc_outer_outer * 576 - -blockIdx_x * 36864 + (threadIdx_x + 1078) // 192 * 4608 + rc_outer_outer * 576 - -3 - -threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 1078) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 1078) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer - -354 - -blockIdx_x * 36864 + (threadIdx_x + 1078) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 354 - -kernel[blockIdx_x * 36864 + (threadIdx_x + 1078) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 354] - -1078 - -threadIdx_x + 1078 - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[threadIdx_x + 1078] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1078) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 
354] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[threadIdx_x + 1078] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1078) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 354] - -36864 - -blockIdx_x * 36864 - -1127 - -threadIdx_x + 1127 - -192 - -(threadIdx_x + 1127) // 192 - -4608 - -(threadIdx_x + 1127) // 192 * 4608 - -blockIdx_x * 36864 + (threadIdx_x + 1127) // 192 * 4608 - -576 - -rc_outer_outer * 576 - -blockIdx_x * 36864 + (threadIdx_x + 1127) // 192 * 4608 + rc_outer_outer * 576 - -167 - -threadIdx_x + 167 - -192 - -(threadIdx_x + 167) % 192 - -3 - -(threadIdx_x + 167) % 192 * 3 - -blockIdx_x * 36864 + (threadIdx_x + 1127) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 167) % 192 * 3 - -blockIdx_x * 36864 + (threadIdx_x + 1127) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 167) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (threadIdx_x + 1127) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 167) % 192 * 3 + rx_outer_outer] - -1127 - -threadIdx_x + 1127 - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[threadIdx_x + 1127] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1127) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 167) % 192 * 3 + rx_outer_outer] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[threadIdx_x + 1127] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1127) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 167) % 192 * 3 + rx_outer_outer] - 
-36864 - -blockIdx_x * 36864 - -1176 - -threadIdx_x + 1176 - -192 - -(threadIdx_x + 1176) // 192 - -4608 - -(threadIdx_x + 1176) // 192 * 4608 - -blockIdx_x * 36864 + (threadIdx_x + 1176) // 192 * 4608 - -576 - -rc_outer_outer * 576 - -blockIdx_x * 36864 + (threadIdx_x + 1176) // 192 * 4608 + rc_outer_outer * 576 - -3 - -threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 1176) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 1176) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer - -72 - -blockIdx_x * 36864 + (threadIdx_x + 1176) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 72 - -kernel[blockIdx_x * 36864 + (threadIdx_x + 1176) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 72] - -1176 - -threadIdx_x + 1176 - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[threadIdx_x + 1176] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1176) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 72] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[threadIdx_x + 1176] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1176) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 72] - -36864 - -blockIdx_x * 36864 - -1225 - -threadIdx_x + 1225 - -192 - -(threadIdx_x + 1225) // 192 - -4608 - -(threadIdx_x + 1225) // 192 * 4608 - -blockIdx_x * 36864 + (threadIdx_x + 1225) // 192 * 4608 - -576 - -rc_outer_outer * 576 - -blockIdx_x * 36864 + (threadIdx_x + 1225) // 192 * 4608 + rc_outer_outer * 576 - -3 - -threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 1225) // 192 * 4608 + 
rc_outer_outer * 576 + threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 1225) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer - -219 - -blockIdx_x * 36864 + (threadIdx_x + 1225) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 219 - -kernel[blockIdx_x * 36864 + (threadIdx_x + 1225) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 219] - -1225 - -threadIdx_x + 1225 - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[threadIdx_x + 1225] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1225) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 219] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[threadIdx_x + 1225] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1225) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 219] - -36864 - -blockIdx_x * 36864 - -1274 - -threadIdx_x + 1274 - -192 - -(threadIdx_x + 1274) // 192 - -4608 - -(threadIdx_x + 1274) // 192 * 4608 - -blockIdx_x * 36864 + (threadIdx_x + 1274) // 192 * 4608 - -576 - -rc_outer_outer * 576 - -blockIdx_x * 36864 + (threadIdx_x + 1274) // 192 * 4608 + rc_outer_outer * 576 - -3 - -threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 1274) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 1274) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer - -366 - -blockIdx_x * 36864 + (threadIdx_x + 1274) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 366 - -kernel[blockIdx_x * 36864 + (threadIdx_x + 1274) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + 
rx_outer_outer + 366] - -1274 - -threadIdx_x + 1274 - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[threadIdx_x + 1274] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1274) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 366] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[threadIdx_x + 1274] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1274) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 366] - -36864 - -blockIdx_x * 36864 - -1323 - -threadIdx_x + 1323 - -192 - -(threadIdx_x + 1323) // 192 - -4608 - -(threadIdx_x + 1323) // 192 * 4608 - -blockIdx_x * 36864 + (threadIdx_x + 1323) // 192 * 4608 - -576 - -rc_outer_outer * 576 - -blockIdx_x * 36864 + (threadIdx_x + 1323) // 192 * 4608 + rc_outer_outer * 576 - -171 - -threadIdx_x + 171 - -192 - -(threadIdx_x + 171) % 192 - -3 - -(threadIdx_x + 171) % 192 * 3 - -blockIdx_x * 36864 + (threadIdx_x + 1323) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 171) % 192 * 3 - -blockIdx_x * 36864 + (threadIdx_x + 1323) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 171) % 192 * 3 + rx_outer_outer - -kernel[blockIdx_x * 36864 + (threadIdx_x + 1323) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 171) % 192 * 3 + rx_outer_outer] - -1323 - -threadIdx_x + 1323 - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[threadIdx_x + 1323] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1323) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 171) % 192 * 3 + rx_outer_outer] - 
-with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[threadIdx_x + 1323] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1323) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x + 171) % 192 * 3 + rx_outer_outer] - -36864 - -blockIdx_x * 36864 - -1372 - -threadIdx_x + 1372 - -192 - -(threadIdx_x + 1372) // 192 - -4608 - -(threadIdx_x + 1372) // 192 * 4608 - -blockIdx_x * 36864 + (threadIdx_x + 1372) // 192 * 4608 - -576 - -rc_outer_outer * 576 - -blockIdx_x * 36864 + (threadIdx_x + 1372) // 192 * 4608 + rc_outer_outer * 576 - -3 - -threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 1372) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 1372) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer - -84 - -blockIdx_x * 36864 + (threadIdx_x + 1372) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 84 - -kernel[blockIdx_x * 36864 + (threadIdx_x + 1372) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 84] - -1372 - -threadIdx_x + 1372 - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[threadIdx_x + 1372] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1372) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 84] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[threadIdx_x + 1372] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1372) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 84] - -36864 - 
-blockIdx_x * 36864 - -1421 - -threadIdx_x + 1421 - -192 - -(threadIdx_x + 1421) // 192 - -4608 - -(threadIdx_x + 1421) // 192 * 4608 - -blockIdx_x * 36864 + (threadIdx_x + 1421) // 192 * 4608 - -576 - -rc_outer_outer * 576 - -blockIdx_x * 36864 + (threadIdx_x + 1421) // 192 * 4608 + rc_outer_outer * 576 - -3 - -threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 1421) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 1421) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer - -231 - -blockIdx_x * 36864 + (threadIdx_x + 1421) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 231 - -kernel[blockIdx_x * 36864 + (threadIdx_x + 1421) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 231] - -1421 - -threadIdx_x + 1421 - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[threadIdx_x + 1421] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1421) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 231] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[threadIdx_x + 1421] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1421) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 231] - -36864 - -blockIdx_x * 36864 - -1470 - -threadIdx_x + 1470 - -192 - -(threadIdx_x + 1470) // 192 - -4608 - -(threadIdx_x + 1470) // 192 * 4608 - -blockIdx_x * 36864 + (threadIdx_x + 1470) // 192 * 4608 - -576 - -rc_outer_outer * 576 - -blockIdx_x * 36864 + (threadIdx_x + 1470) // 192 * 4608 + rc_outer_outer * 576 - -3 - -threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 1470) // 192 * 4608 + 
rc_outer_outer * 576 + threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 1470) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer - -378 - -blockIdx_x * 36864 + (threadIdx_x + 1470) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 378 - -kernel[blockIdx_x * 36864 + (threadIdx_x + 1470) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 378] - -1470 - -threadIdx_x + 1470 - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[threadIdx_x + 1470] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1470) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 378] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[threadIdx_x + 1470] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1470) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 378] - -17 - -threadIdx_x < 17 - -36864 - -blockIdx_x * 36864 - -1519 - -threadIdx_x + 1519 - -192 - -(threadIdx_x + 1519) // 192 - -4608 - -(threadIdx_x + 1519) // 192 * 4608 - -blockIdx_x * 36864 + (threadIdx_x + 1519) // 192 * 4608 - -576 - -rc_outer_outer * 576 - -blockIdx_x * 36864 + (threadIdx_x + 1519) // 192 * 4608 + rc_outer_outer * 576 - -3 - -threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 1519) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 - -blockIdx_x * 36864 + (threadIdx_x + 1519) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer - -525 - -blockIdx_x * 36864 + (threadIdx_x + 1519) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 525 - -kernel[blockIdx_x * 36864 + (threadIdx_x + 1519) // 192 * 4608 + rc_outer_outer * 576 + 
threadIdx_x * 3 + rx_outer_outer + 525] - -1519 - -threadIdx_x + 1519 - -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -threadIdx_x = T.int32() -rc_outer_outer = T.int32() -rx_outer_outer = T.int32() -kernel_shared[threadIdx_x + 1519] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1519) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 525] - -threadIdx_x = T.int32() -if threadIdx_x < 17: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[threadIdx_x + 1519] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1519) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 525] - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - if threadIdx_x < 17: - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - rc_outer_outer = T.int32() - rx_outer_outer = T.int32() - kernel_shared[threadIdx_x + 1519] = kernel[blockIdx_x * 36864 + (threadIdx_x + 1519) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x * 3 + rx_outer_outer + 525] - -0 - -8 - -0 - -conv2d_nchw[0] - -rc_outer_inner - -504 - -rc_outer_inner * 504 - -threadIdx_x - -rc_outer_inner * 504 + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] - -24 - -rc_outer_inner * 24 - -kernel_shared[rc_outer_inner * 24] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24] - -conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24] - -0 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * 
kernel_shared[rc_outer_inner * 24] - -1 - -conv2d_nchw[1] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] - -24 - -rc_outer_inner * 24 - -192 - -rc_outer_inner * 24 + 192 - -kernel_shared[rc_outer_inner * 24 + 192] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 192] - -conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 192] - -1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 192] - -2 - -conv2d_nchw[2] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] - -24 - -rc_outer_inner * 24 - -384 - -rc_outer_inner * 24 + 384 - -kernel_shared[rc_outer_inner * 24 + 384] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 384] - -conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 384] - -2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 384] - -3 - -conv2d_nchw[3] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] - -24 - -rc_outer_inner * 24 - -576 - -rc_outer_inner * 24 + 576 - -kernel_shared[rc_outer_inner * 24 + 576] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * 
kernel_shared[rc_outer_inner * 24 + 576] - -conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 576] - -3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 576] - -0 - -conv2d_nchw[0] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -63 - -rc_outer_inner * 504 + threadIdx_x + 63 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] - -24 - -rc_outer_inner * 24 - -3 - -rc_outer_inner * 24 + 3 - -kernel_shared[rc_outer_inner * 24 + 3] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 3] - -conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 3] - -0 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 3] - -1 - -conv2d_nchw[1] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -63 - -rc_outer_inner * 504 + threadIdx_x + 63 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] - -24 - -rc_outer_inner * 24 - -195 - -rc_outer_inner * 24 + 195 - -kernel_shared[rc_outer_inner * 24 + 195] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 195] - -conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 195] - -1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = 
T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 195] - -2 - -conv2d_nchw[2] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -63 - -rc_outer_inner * 504 + threadIdx_x + 63 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] - -24 - -rc_outer_inner * 24 - -387 - -rc_outer_inner * 24 + 387 - -kernel_shared[rc_outer_inner * 24 + 387] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 387] - -conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 387] - -2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 387] - -3 - -conv2d_nchw[3] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -63 - -rc_outer_inner * 504 + threadIdx_x + 63 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] - -24 - -rc_outer_inner * 24 - -579 - -rc_outer_inner * 24 + 579 - -kernel_shared[rc_outer_inner * 24 + 579] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 579] - -conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 579] - -3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 
63] * kernel_shared[rc_outer_inner * 24 + 579] - -0 - -conv2d_nchw[0] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -126 - -rc_outer_inner * 504 + threadIdx_x + 126 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] - -24 - -rc_outer_inner * 24 - -6 - -rc_outer_inner * 24 + 6 - -kernel_shared[rc_outer_inner * 24 + 6] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 6] - -conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 6] - -0 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 6] - -1 - -conv2d_nchw[1] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -126 - -rc_outer_inner * 504 + threadIdx_x + 126 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] - -24 - -rc_outer_inner * 24 - -198 - -rc_outer_inner * 24 + 198 - -kernel_shared[rc_outer_inner * 24 + 198] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 198] - -conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 198] - -1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 198] - -2 - -conv2d_nchw[2] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -126 - -rc_outer_inner * 504 + threadIdx_x + 126 - -pad_temp_shared[rc_outer_inner * 
504 + threadIdx_x + 126] - -24 - -rc_outer_inner * 24 - -390 - -rc_outer_inner * 24 + 390 - -kernel_shared[rc_outer_inner * 24 + 390] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 390] - -conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 390] - -2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 390] - -3 - -conv2d_nchw[3] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -126 - -rc_outer_inner * 504 + threadIdx_x + 126 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] - -24 - -rc_outer_inner * 24 - -582 - -rc_outer_inner * 24 + 582 - -kernel_shared[rc_outer_inner * 24 + 582] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 582] - -conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 582] - -3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 582] - -0 - -conv2d_nchw[0] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -189 - -rc_outer_inner * 504 + threadIdx_x + 189 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] - -24 - -rc_outer_inner * 24 - -9 - -rc_outer_inner * 24 + 9 - -kernel_shared[rc_outer_inner * 24 + 9] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * 
kernel_shared[rc_outer_inner * 24 + 9] - -conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 9] - -0 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 9] - -1 - -conv2d_nchw[1] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -189 - -rc_outer_inner * 504 + threadIdx_x + 189 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] - -24 - -rc_outer_inner * 24 - -201 - -rc_outer_inner * 24 + 201 - -kernel_shared[rc_outer_inner * 24 + 201] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 201] - -conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 201] - -1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 201] - -2 - -conv2d_nchw[2] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -189 - -rc_outer_inner * 504 + threadIdx_x + 189 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] - -24 - -rc_outer_inner * 24 - -393 - -rc_outer_inner * 24 + 393 - -kernel_shared[rc_outer_inner * 24 + 393] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 393] - -conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 393] - -2 - -conv2d_nchw = T.Buffer((8,), scope="local", 
align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 393] - -3 - -conv2d_nchw[3] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -189 - -rc_outer_inner * 504 + threadIdx_x + 189 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] - -24 - -rc_outer_inner * 24 - -585 - -rc_outer_inner * 24 + 585 - -kernel_shared[rc_outer_inner * 24 + 585] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 585] - -conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 585] - -3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 585] - -0 - -conv2d_nchw[0] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -252 - -rc_outer_inner * 504 + threadIdx_x + 252 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] - -24 - -rc_outer_inner * 24 - -12 - -rc_outer_inner * 24 + 12 - -kernel_shared[rc_outer_inner * 24 + 12] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 12] - -conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 12] - -0 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[0] = conv2d_nchw[0] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 12] - -1 - -conv2d_nchw[1] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -252 - -rc_outer_inner * 504 + threadIdx_x + 252 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] - -24 - -rc_outer_inner * 24 - -204 - -rc_outer_inner * 24 + 204 - -kernel_shared[rc_outer_inner * 24 + 204] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 204] - -conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 204] - -1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 204] - -2 - -conv2d_nchw[2] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -252 - -rc_outer_inner * 504 + threadIdx_x + 252 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] - -24 - -rc_outer_inner * 24 - -396 - -rc_outer_inner * 24 + 396 - -kernel_shared[rc_outer_inner * 24 + 396] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 396] - -conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 396] - -2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 396] - -3 - -conv2d_nchw[3] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -252 - 
-rc_outer_inner * 504 + threadIdx_x + 252 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] - -24 - -rc_outer_inner * 24 - -588 - -rc_outer_inner * 24 + 588 - -kernel_shared[rc_outer_inner * 24 + 588] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 588] - -conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 588] - -3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 588] - -0 - -conv2d_nchw[0] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -315 - -rc_outer_inner * 504 + threadIdx_x + 315 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] - -24 - -rc_outer_inner * 24 - -15 - -rc_outer_inner * 24 + 15 - -kernel_shared[rc_outer_inner * 24 + 15] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 15] - -conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 15] - -0 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 15] - -1 - -conv2d_nchw[1] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -315 - -rc_outer_inner * 504 + threadIdx_x + 315 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] - -24 - -rc_outer_inner * 24 - -207 - -rc_outer_inner * 24 + 207 - -kernel_shared[rc_outer_inner * 24 + 207] - 
-pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 207] - -conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 207] - -1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 207] - -2 - -conv2d_nchw[2] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -315 - -rc_outer_inner * 504 + threadIdx_x + 315 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] - -24 - -rc_outer_inner * 24 - -399 - -rc_outer_inner * 24 + 399 - -kernel_shared[rc_outer_inner * 24 + 399] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 399] - -conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 399] - -2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 399] - -3 - -conv2d_nchw[3] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -315 - -rc_outer_inner * 504 + threadIdx_x + 315 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] - -24 - -rc_outer_inner * 24 - -591 - -rc_outer_inner * 24 + 591 - -kernel_shared[rc_outer_inner * 24 + 591] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 591] - -conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 
24 + 591] - -3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 591] - -0 - -conv2d_nchw[0] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -378 - -rc_outer_inner * 504 + threadIdx_x + 378 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] - -24 - -rc_outer_inner * 24 - -18 - -rc_outer_inner * 24 + 18 - -kernel_shared[rc_outer_inner * 24 + 18] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 18] - -conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 18] - -0 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 18] - -1 - -conv2d_nchw[1] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -378 - -rc_outer_inner * 504 + threadIdx_x + 378 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] - -24 - -rc_outer_inner * 24 - -210 - -rc_outer_inner * 24 + 210 - -kernel_shared[rc_outer_inner * 24 + 210] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 210] - -conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 210] - -1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), 
scope="shared") -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 210] - -2 - -conv2d_nchw[2] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -378 - -rc_outer_inner * 504 + threadIdx_x + 378 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] - -24 - -rc_outer_inner * 24 - -402 - -rc_outer_inner * 24 + 402 - -kernel_shared[rc_outer_inner * 24 + 402] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 402] - -conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 402] - -2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 402] - -3 - -conv2d_nchw[3] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -378 - -rc_outer_inner * 504 + threadIdx_x + 378 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] - -24 - -rc_outer_inner * 24 - -594 - -rc_outer_inner * 24 + 594 - -kernel_shared[rc_outer_inner * 24 + 594] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 594] - -conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 594] - -3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 594] - -0 - -conv2d_nchw[0] - -504 - -rc_outer_inner * 504 - 
-rc_outer_inner * 504 + threadIdx_x - -441 - -rc_outer_inner * 504 + threadIdx_x + 441 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] - -24 - -rc_outer_inner * 24 - -21 - -rc_outer_inner * 24 + 21 - -kernel_shared[rc_outer_inner * 24 + 21] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 21] - -conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 21] - -0 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 21] - -1 - -conv2d_nchw[1] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -441 - -rc_outer_inner * 504 + threadIdx_x + 441 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] - -24 - -rc_outer_inner * 24 - -213 - -rc_outer_inner * 24 + 213 - -kernel_shared[rc_outer_inner * 24 + 213] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 213] - -conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 213] - -1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 213] - -2 - -conv2d_nchw[2] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -441 - -rc_outer_inner * 504 + threadIdx_x + 441 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] - -24 - -rc_outer_inner * 24 - -405 - -rc_outer_inner * 24 + 405 - 
-kernel_shared[rc_outer_inner * 24 + 405] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 405] - -conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 405] - -2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 405] - -3 - -conv2d_nchw[3] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -441 - -rc_outer_inner * 504 + threadIdx_x + 441 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] - -24 - -rc_outer_inner * 24 - -597 - -rc_outer_inner * 24 + 597 - -kernel_shared[rc_outer_inner * 24 + 597] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 597] - -conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 597] - -3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 597] - -4 - -conv2d_nchw[4] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] - -24 - -rc_outer_inner * 24 - -768 - -rc_outer_inner * 24 + 768 - -kernel_shared[rc_outer_inner * 24 + 768] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 768] - -conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 768] - -4 - 
-conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 768] - -5 - -conv2d_nchw[5] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] - -24 - -rc_outer_inner * 24 - -960 - -rc_outer_inner * 24 + 960 - -kernel_shared[rc_outer_inner * 24 + 960] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 960] - -conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 960] - -5 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 960] - -6 - -conv2d_nchw[6] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] - -24 - -rc_outer_inner * 24 - -1152 - -rc_outer_inner * 24 + 1152 - -kernel_shared[rc_outer_inner * 24 + 1152] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 1152] - -conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 1152] - -6 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 1152] - -7 - 
-conv2d_nchw[7] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] - -24 - -rc_outer_inner * 24 - -1344 - -rc_outer_inner * 24 + 1344 - -kernel_shared[rc_outer_inner * 24 + 1344] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 1344] - -conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 1344] - -7 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 1344] - -4 - -conv2d_nchw[4] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -63 - -rc_outer_inner * 504 + threadIdx_x + 63 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] - -24 - -rc_outer_inner * 24 - -771 - -rc_outer_inner * 24 + 771 - -kernel_shared[rc_outer_inner * 24 + 771] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 771] - -conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 771] - -4 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 771] - -5 - -conv2d_nchw[5] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -63 - -rc_outer_inner * 504 + threadIdx_x + 63 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] - -24 - -rc_outer_inner * 24 - -963 - -rc_outer_inner * 24 + 963 - 
-kernel_shared[rc_outer_inner * 24 + 963] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 963] - -conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 963] - -5 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 963] - -6 - -conv2d_nchw[6] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -63 - -rc_outer_inner * 504 + threadIdx_x + 63 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] - -24 - -rc_outer_inner * 24 - -1155 - -rc_outer_inner * 24 + 1155 - -kernel_shared[rc_outer_inner * 24 + 1155] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 1155] - -conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 1155] - -6 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 1155] - -7 - -conv2d_nchw[7] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -63 - -rc_outer_inner * 504 + threadIdx_x + 63 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] - -24 - -rc_outer_inner * 24 - -1347 - -rc_outer_inner * 24 + 1347 - -kernel_shared[rc_outer_inner * 24 + 1347] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 1347] - -conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x 
+ 63] * kernel_shared[rc_outer_inner * 24 + 1347] - -7 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 1347] - -4 - -conv2d_nchw[4] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -126 - -rc_outer_inner * 504 + threadIdx_x + 126 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] - -24 - -rc_outer_inner * 24 - -774 - -rc_outer_inner * 24 + 774 - -kernel_shared[rc_outer_inner * 24 + 774] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 774] - -conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 774] - -4 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 774] - -5 - -conv2d_nchw[5] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -126 - -rc_outer_inner * 504 + threadIdx_x + 126 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] - -24 - -rc_outer_inner * 24 - -966 - -rc_outer_inner * 24 + 966 - -kernel_shared[rc_outer_inner * 24 + 966] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 966] - -conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 966] - -5 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = 
T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 966] - -6 - -conv2d_nchw[6] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -126 - -rc_outer_inner * 504 + threadIdx_x + 126 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] - -24 - -rc_outer_inner * 24 - -1158 - -rc_outer_inner * 24 + 1158 - -kernel_shared[rc_outer_inner * 24 + 1158] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 1158] - -conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 1158] - -6 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 1158] - -7 - -conv2d_nchw[7] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -126 - -rc_outer_inner * 504 + threadIdx_x + 126 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] - -24 - -rc_outer_inner * 24 - -1350 - -rc_outer_inner * 24 + 1350 - -kernel_shared[rc_outer_inner * 24 + 1350] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 1350] - -conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 1350] - -7 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 1350] - -4 
- -conv2d_nchw[4] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -189 - -rc_outer_inner * 504 + threadIdx_x + 189 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] - -24 - -rc_outer_inner * 24 - -777 - -rc_outer_inner * 24 + 777 - -kernel_shared[rc_outer_inner * 24 + 777] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 777] - -conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 777] - -4 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 777] - -5 - -conv2d_nchw[5] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -189 - -rc_outer_inner * 504 + threadIdx_x + 189 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] - -24 - -rc_outer_inner * 24 - -969 - -rc_outer_inner * 24 + 969 - -kernel_shared[rc_outer_inner * 24 + 969] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 969] - -conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 969] - -5 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 969] - -6 - -conv2d_nchw[6] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -189 - -rc_outer_inner * 504 + threadIdx_x + 189 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] - -24 - 
-rc_outer_inner * 24 - -1161 - -rc_outer_inner * 24 + 1161 - -kernel_shared[rc_outer_inner * 24 + 1161] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 1161] - -conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 1161] - -6 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 1161] - -7 - -conv2d_nchw[7] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -189 - -rc_outer_inner * 504 + threadIdx_x + 189 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] - -24 - -rc_outer_inner * 24 - -1353 - -rc_outer_inner * 24 + 1353 - -kernel_shared[rc_outer_inner * 24 + 1353] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 1353] - -conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 1353] - -7 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 1353] - -4 - -conv2d_nchw[4] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -252 - -rc_outer_inner * 504 + threadIdx_x + 252 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] - -24 - -rc_outer_inner * 24 - -780 - -rc_outer_inner * 24 + 780 - -kernel_shared[rc_outer_inner * 24 + 780] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 
780] - -conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 780] - -4 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 780] - -5 - -conv2d_nchw[5] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -252 - -rc_outer_inner * 504 + threadIdx_x + 252 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] - -24 - -rc_outer_inner * 24 - -972 - -rc_outer_inner * 24 + 972 - -kernel_shared[rc_outer_inner * 24 + 972] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 972] - -conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 972] - -5 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 972] - -6 - -conv2d_nchw[6] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -252 - -rc_outer_inner * 504 + threadIdx_x + 252 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] - -24 - -rc_outer_inner * 24 - -1164 - -rc_outer_inner * 24 + 1164 - -kernel_shared[rc_outer_inner * 24 + 1164] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 1164] - -conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 1164] - -6 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = 
T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 1164] - -7 - -conv2d_nchw[7] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -252 - -rc_outer_inner * 504 + threadIdx_x + 252 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] - -24 - -rc_outer_inner * 24 - -1356 - -rc_outer_inner * 24 + 1356 - -kernel_shared[rc_outer_inner * 24 + 1356] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 1356] - -conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 1356] - -7 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 1356] - -4 - -conv2d_nchw[4] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -315 - -rc_outer_inner * 504 + threadIdx_x + 315 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] - -24 - -rc_outer_inner * 24 - -783 - -rc_outer_inner * 24 + 783 - -kernel_shared[rc_outer_inner * 24 + 783] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 783] - -conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 783] - -4 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 
504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 783] - -5 - -conv2d_nchw[5] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -315 - -rc_outer_inner * 504 + threadIdx_x + 315 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] - -24 - -rc_outer_inner * 24 - -975 - -rc_outer_inner * 24 + 975 - -kernel_shared[rc_outer_inner * 24 + 975] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 975] - -conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 975] - -5 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 975] - -6 - -conv2d_nchw[6] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -315 - -rc_outer_inner * 504 + threadIdx_x + 315 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] - -24 - -rc_outer_inner * 24 - -1167 - -rc_outer_inner * 24 + 1167 - -kernel_shared[rc_outer_inner * 24 + 1167] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 1167] - -conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 1167] - -6 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 1167] - -7 - -conv2d_nchw[7] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -315 - -rc_outer_inner * 504 + threadIdx_x + 
315 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] - -24 - -rc_outer_inner * 24 - -1359 - -rc_outer_inner * 24 + 1359 - -kernel_shared[rc_outer_inner * 24 + 1359] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 1359] - -conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 1359] - -7 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 1359] - -4 - -conv2d_nchw[4] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -378 - -rc_outer_inner * 504 + threadIdx_x + 378 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] - -24 - -rc_outer_inner * 24 - -786 - -rc_outer_inner * 24 + 786 - -kernel_shared[rc_outer_inner * 24 + 786] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 786] - -conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 786] - -4 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 786] - -5 - -conv2d_nchw[5] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -378 - -rc_outer_inner * 504 + threadIdx_x + 378 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] - -24 - -rc_outer_inner * 24 - -978 - -rc_outer_inner * 24 + 978 - -kernel_shared[rc_outer_inner * 24 + 978] - -pad_temp_shared[rc_outer_inner 
* 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 978] - -conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 978] - -5 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 978] - -6 - -conv2d_nchw[6] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -378 - -rc_outer_inner * 504 + threadIdx_x + 378 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] - -24 - -rc_outer_inner * 24 - -1170 - -rc_outer_inner * 24 + 1170 - -kernel_shared[rc_outer_inner * 24 + 1170] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 1170] - -conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 1170] - -6 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 1170] - -7 - -conv2d_nchw[7] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -378 - -rc_outer_inner * 504 + threadIdx_x + 378 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] - -24 - -rc_outer_inner * 24 - -1362 - -rc_outer_inner * 24 + 1362 - -kernel_shared[rc_outer_inner * 24 + 1362] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 1362] - -conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 1362] - -7 - 
-conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 1362] - -4 - -conv2d_nchw[4] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -441 - -rc_outer_inner * 504 + threadIdx_x + 441 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] - -24 - -rc_outer_inner * 24 - -789 - -rc_outer_inner * 24 + 789 - -kernel_shared[rc_outer_inner * 24 + 789] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 789] - -conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 789] - -4 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 789] - -5 - -conv2d_nchw[5] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -441 - -rc_outer_inner * 504 + threadIdx_x + 441 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] - -24 - -rc_outer_inner * 24 - -981 - -rc_outer_inner * 24 + 981 - -kernel_shared[rc_outer_inner * 24 + 981] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 981] - -conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 981] - -5 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") 
-conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 981] - -6 - -conv2d_nchw[6] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -441 - -rc_outer_inner * 504 + threadIdx_x + 441 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] - -24 - -rc_outer_inner * 24 - -1173 - -rc_outer_inner * 24 + 1173 - -kernel_shared[rc_outer_inner * 24 + 1173] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 1173] - -conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 1173] - -6 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 1173] - -7 - -conv2d_nchw[7] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -441 - -rc_outer_inner * 504 + threadIdx_x + 441 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] - -24 - -rc_outer_inner * 24 - -1365 - -rc_outer_inner * 24 + 1365 - -kernel_shared[rc_outer_inner * 24 + 1365] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 1365] - -conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 1365] - -7 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 1365] - -0 - -conv2d_nchw[0] - -504 - -rc_outer_inner * 504 - 
-rc_outer_inner * 504 + threadIdx_x - -7 - -rc_outer_inner * 504 + threadIdx_x + 7 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] - -24 - -rc_outer_inner * 24 - -1 - -rc_outer_inner * 24 + 1 - -kernel_shared[rc_outer_inner * 24 + 1] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1] - -conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1] - -0 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1] - -1 - -conv2d_nchw[1] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -7 - -rc_outer_inner * 504 + threadIdx_x + 7 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] - -24 - -rc_outer_inner * 24 - -193 - -rc_outer_inner * 24 + 193 - -kernel_shared[rc_outer_inner * 24 + 193] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 193] - -conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 193] - -1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 193] - -2 - -conv2d_nchw[2] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -7 - -rc_outer_inner * 504 + threadIdx_x + 7 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] - -24 - -rc_outer_inner * 24 - -385 - -rc_outer_inner * 24 + 385 - -kernel_shared[rc_outer_inner * 24 + 
385] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 385] - -conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 385] - -2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 385] - -3 - -conv2d_nchw[3] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -7 - -rc_outer_inner * 504 + threadIdx_x + 7 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] - -24 - -rc_outer_inner * 24 - -577 - -rc_outer_inner * 24 + 577 - -kernel_shared[rc_outer_inner * 24 + 577] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 577] - -conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 577] - -3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 577] - -0 - -conv2d_nchw[0] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -70 - -rc_outer_inner * 504 + threadIdx_x + 70 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] - -24 - -rc_outer_inner * 24 - -4 - -rc_outer_inner * 24 + 4 - -kernel_shared[rc_outer_inner * 24 + 4] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 4] - -conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 4] - -0 - 
-conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 4] - -1 - -conv2d_nchw[1] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -70 - -rc_outer_inner * 504 + threadIdx_x + 70 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] - -24 - -rc_outer_inner * 24 - -196 - -rc_outer_inner * 24 + 196 - -kernel_shared[rc_outer_inner * 24 + 196] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 196] - -conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 196] - -1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 196] - -2 - -conv2d_nchw[2] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -70 - -rc_outer_inner * 504 + threadIdx_x + 70 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] - -24 - -rc_outer_inner * 24 - -388 - -rc_outer_inner * 24 + 388 - -kernel_shared[rc_outer_inner * 24 + 388] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 388] - -conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 388] - -2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[2] 
= conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 388] - -3 - -conv2d_nchw[3] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -70 - -rc_outer_inner * 504 + threadIdx_x + 70 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] - -24 - -rc_outer_inner * 24 - -580 - -rc_outer_inner * 24 + 580 - -kernel_shared[rc_outer_inner * 24 + 580] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 580] - -conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 580] - -3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 580] - -0 - -conv2d_nchw[0] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -133 - -rc_outer_inner * 504 + threadIdx_x + 133 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] - -24 - -rc_outer_inner * 24 - -7 - -rc_outer_inner * 24 + 7 - -kernel_shared[rc_outer_inner * 24 + 7] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 7] - -conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 7] - -0 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 7] - -1 - -conv2d_nchw[1] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -133 - 
-rc_outer_inner * 504 + threadIdx_x + 133 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] - -24 - -rc_outer_inner * 24 - -199 - -rc_outer_inner * 24 + 199 - -kernel_shared[rc_outer_inner * 24 + 199] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 199] - -conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 199] - -1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 199] - -2 - -conv2d_nchw[2] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -133 - -rc_outer_inner * 504 + threadIdx_x + 133 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] - -24 - -rc_outer_inner * 24 - -391 - -rc_outer_inner * 24 + 391 - -kernel_shared[rc_outer_inner * 24 + 391] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 391] - -conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 391] - -2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 391] - -3 - -conv2d_nchw[3] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -133 - -rc_outer_inner * 504 + threadIdx_x + 133 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] - -24 - -rc_outer_inner * 24 - -583 - -rc_outer_inner * 24 + 583 - -kernel_shared[rc_outer_inner * 24 + 583] - 
-pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 583] - -conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 583] - -3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 583] - -0 - -conv2d_nchw[0] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -196 - -rc_outer_inner * 504 + threadIdx_x + 196 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] - -24 - -rc_outer_inner * 24 - -10 - -rc_outer_inner * 24 + 10 - -kernel_shared[rc_outer_inner * 24 + 10] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 10] - -conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 10] - -0 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 10] - -1 - -conv2d_nchw[1] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -196 - -rc_outer_inner * 504 + threadIdx_x + 196 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] - -24 - -rc_outer_inner * 24 - -202 - -rc_outer_inner * 24 + 202 - -kernel_shared[rc_outer_inner * 24 + 202] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 202] - -conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 
202] - -1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 202] - -2 - -conv2d_nchw[2] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -196 - -rc_outer_inner * 504 + threadIdx_x + 196 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] - -24 - -rc_outer_inner * 24 - -394 - -rc_outer_inner * 24 + 394 - -kernel_shared[rc_outer_inner * 24 + 394] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 394] - -conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 394] - -2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 394] - -3 - -conv2d_nchw[3] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -196 - -rc_outer_inner * 504 + threadIdx_x + 196 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] - -24 - -rc_outer_inner * 24 - -586 - -rc_outer_inner * 24 + 586 - -kernel_shared[rc_outer_inner * 24 + 586] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 586] - -conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 586] - -3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), 
scope="shared") -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 586] - -0 - -conv2d_nchw[0] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -259 - -rc_outer_inner * 504 + threadIdx_x + 259 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] - -24 - -rc_outer_inner * 24 - -13 - -rc_outer_inner * 24 + 13 - -kernel_shared[rc_outer_inner * 24 + 13] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 13] - -conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 13] - -0 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 13] - -1 - -conv2d_nchw[1] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -259 - -rc_outer_inner * 504 + threadIdx_x + 259 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] - -24 - -rc_outer_inner * 24 - -205 - -rc_outer_inner * 24 + 205 - -kernel_shared[rc_outer_inner * 24 + 205] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 205] - -conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 205] - -1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 205] - -2 - -conv2d_nchw[2] - -504 - -rc_outer_inner * 504 - 
-rc_outer_inner * 504 + threadIdx_x - -259 - -rc_outer_inner * 504 + threadIdx_x + 259 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] - -24 - -rc_outer_inner * 24 - -397 - -rc_outer_inner * 24 + 397 - -kernel_shared[rc_outer_inner * 24 + 397] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 397] - -conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 397] - -2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 397] - -3 - -conv2d_nchw[3] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -259 - -rc_outer_inner * 504 + threadIdx_x + 259 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] - -24 - -rc_outer_inner * 24 - -589 - -rc_outer_inner * 24 + 589 - -kernel_shared[rc_outer_inner * 24 + 589] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 589] - -conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 589] - -3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 589] - -0 - -conv2d_nchw[0] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -322 - -rc_outer_inner * 504 + threadIdx_x + 322 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] - -24 - -rc_outer_inner * 24 - -16 - -rc_outer_inner * 24 + 16 - 
-kernel_shared[rc_outer_inner * 24 + 16] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 16] - -conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 16] - -0 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 16] - -1 - -conv2d_nchw[1] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -322 - -rc_outer_inner * 504 + threadIdx_x + 322 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] - -24 - -rc_outer_inner * 24 - -208 - -rc_outer_inner * 24 + 208 - -kernel_shared[rc_outer_inner * 24 + 208] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 208] - -conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 208] - -1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 208] - -2 - -conv2d_nchw[2] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -322 - -rc_outer_inner * 504 + threadIdx_x + 322 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] - -24 - -rc_outer_inner * 24 - -400 - -rc_outer_inner * 24 + 400 - -kernel_shared[rc_outer_inner * 24 + 400] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 400] - -conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x 
+ 322] * kernel_shared[rc_outer_inner * 24 + 400] - -2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 400] - -3 - -conv2d_nchw[3] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -322 - -rc_outer_inner * 504 + threadIdx_x + 322 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] - -24 - -rc_outer_inner * 24 - -592 - -rc_outer_inner * 24 + 592 - -kernel_shared[rc_outer_inner * 24 + 592] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 592] - -conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 592] - -3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 592] - -0 - -conv2d_nchw[0] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -385 - -rc_outer_inner * 504 + threadIdx_x + 385 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] - -24 - -rc_outer_inner * 24 - -19 - -rc_outer_inner * 24 + 19 - -kernel_shared[rc_outer_inner * 24 + 19] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 19] - -conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 19] - -0 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() 
-kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 19] - -1 - -conv2d_nchw[1] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -385 - -rc_outer_inner * 504 + threadIdx_x + 385 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] - -24 - -rc_outer_inner * 24 - -211 - -rc_outer_inner * 24 + 211 - -kernel_shared[rc_outer_inner * 24 + 211] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 211] - -conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 211] - -1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 211] - -2 - -conv2d_nchw[2] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -385 - -rc_outer_inner * 504 + threadIdx_x + 385 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] - -24 - -rc_outer_inner * 24 - -403 - -rc_outer_inner * 24 + 403 - -kernel_shared[rc_outer_inner * 24 + 403] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 403] - -conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 403] - -2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 403] - -3 - -conv2d_nchw[3] - 
-504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -385 - -rc_outer_inner * 504 + threadIdx_x + 385 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] - -24 - -rc_outer_inner * 24 - -595 - -rc_outer_inner * 24 + 595 - -kernel_shared[rc_outer_inner * 24 + 595] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 595] - -conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 595] - -3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 595] - -0 - -conv2d_nchw[0] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -448 - -rc_outer_inner * 504 + threadIdx_x + 448 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] - -24 - -rc_outer_inner * 24 - -22 - -rc_outer_inner * 24 + 22 - -kernel_shared[rc_outer_inner * 24 + 22] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 22] - -conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 22] - -0 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 22] - -1 - -conv2d_nchw[1] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -448 - -rc_outer_inner * 504 + threadIdx_x + 448 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] - -24 - -rc_outer_inner * 24 - -214 - 
-rc_outer_inner * 24 + 214 - -kernel_shared[rc_outer_inner * 24 + 214] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 214] - -conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 214] - -1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 214] - -2 - -conv2d_nchw[2] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -448 - -rc_outer_inner * 504 + threadIdx_x + 448 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] - -24 - -rc_outer_inner * 24 - -406 - -rc_outer_inner * 24 + 406 - -kernel_shared[rc_outer_inner * 24 + 406] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 406] - -conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 406] - -2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 406] - -3 - -conv2d_nchw[3] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -448 - -rc_outer_inner * 504 + threadIdx_x + 448 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] - -24 - -rc_outer_inner * 24 - -598 - -rc_outer_inner * 24 + 598 - -kernel_shared[rc_outer_inner * 24 + 598] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 598] - -conv2d_nchw[3] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 598] - -3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 598] - -4 - -conv2d_nchw[4] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -7 - -rc_outer_inner * 504 + threadIdx_x + 7 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] - -24 - -rc_outer_inner * 24 - -769 - -rc_outer_inner * 24 + 769 - -kernel_shared[rc_outer_inner * 24 + 769] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 769] - -conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 769] - -4 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 769] - -5 - -conv2d_nchw[5] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -7 - -rc_outer_inner * 504 + threadIdx_x + 7 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] - -24 - -rc_outer_inner * 24 - -961 - -rc_outer_inner * 24 + 961 - -kernel_shared[rc_outer_inner * 24 + 961] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 961] - -conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 961] - -5 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = 
T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 961] - -6 - -conv2d_nchw[6] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -7 - -rc_outer_inner * 504 + threadIdx_x + 7 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] - -24 - -rc_outer_inner * 24 - -1153 - -rc_outer_inner * 24 + 1153 - -kernel_shared[rc_outer_inner * 24 + 1153] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1153] - -conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1153] - -6 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1153] - -7 - -conv2d_nchw[7] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -7 - -rc_outer_inner * 504 + threadIdx_x + 7 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] - -24 - -rc_outer_inner * 24 - -1345 - -rc_outer_inner * 24 + 1345 - -kernel_shared[rc_outer_inner * 24 + 1345] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1345] - -conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1345] - -7 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1345] - -4 
- -conv2d_nchw[4] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -70 - -rc_outer_inner * 504 + threadIdx_x + 70 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] - -24 - -rc_outer_inner * 24 - -772 - -rc_outer_inner * 24 + 772 - -kernel_shared[rc_outer_inner * 24 + 772] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 772] - -conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 772] - -4 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 772] - -5 - -conv2d_nchw[5] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -70 - -rc_outer_inner * 504 + threadIdx_x + 70 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] - -24 - -rc_outer_inner * 24 - -964 - -rc_outer_inner * 24 + 964 - -kernel_shared[rc_outer_inner * 24 + 964] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 964] - -conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 964] - -5 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 964] - -6 - -conv2d_nchw[6] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -70 - -rc_outer_inner * 504 + threadIdx_x + 70 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] - -24 - -rc_outer_inner * 24 
- -1156 - -rc_outer_inner * 24 + 1156 - -kernel_shared[rc_outer_inner * 24 + 1156] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 1156] - -conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 1156] - -6 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 1156] - -7 - -conv2d_nchw[7] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -70 - -rc_outer_inner * 504 + threadIdx_x + 70 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] - -24 - -rc_outer_inner * 24 - -1348 - -rc_outer_inner * 24 + 1348 - -kernel_shared[rc_outer_inner * 24 + 1348] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 1348] - -conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 1348] - -7 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 1348] - -4 - -conv2d_nchw[4] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -133 - -rc_outer_inner * 504 + threadIdx_x + 133 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] - -24 - -rc_outer_inner * 24 - -775 - -rc_outer_inner * 24 + 775 - -kernel_shared[rc_outer_inner * 24 + 775] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 775] - -conv2d_nchw[4] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 775] - -4 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 775] - -5 - -conv2d_nchw[5] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -133 - -rc_outer_inner * 504 + threadIdx_x + 133 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] - -24 - -rc_outer_inner * 24 - -967 - -rc_outer_inner * 24 + 967 - -kernel_shared[rc_outer_inner * 24 + 967] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 967] - -conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 967] - -5 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 967] - -6 - -conv2d_nchw[6] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -133 - -rc_outer_inner * 504 + threadIdx_x + 133 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] - -24 - -rc_outer_inner * 24 - -1159 - -rc_outer_inner * 24 + 1159 - -kernel_shared[rc_outer_inner * 24 + 1159] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 1159] - -conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 1159] - -6 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), 
scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 1159] - -7 - -conv2d_nchw[7] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -133 - -rc_outer_inner * 504 + threadIdx_x + 133 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] - -24 - -rc_outer_inner * 24 - -1351 - -rc_outer_inner * 24 + 1351 - -kernel_shared[rc_outer_inner * 24 + 1351] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 1351] - -conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 1351] - -7 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 1351] - -4 - -conv2d_nchw[4] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -196 - -rc_outer_inner * 504 + threadIdx_x + 196 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] - -24 - -rc_outer_inner * 24 - -778 - -rc_outer_inner * 24 + 778 - -kernel_shared[rc_outer_inner * 24 + 778] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 778] - -conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 778] - -4 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x 
+ 196] * kernel_shared[rc_outer_inner * 24 + 778] - -5 - -conv2d_nchw[5] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -196 - -rc_outer_inner * 504 + threadIdx_x + 196 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] - -24 - -rc_outer_inner * 24 - -970 - -rc_outer_inner * 24 + 970 - -kernel_shared[rc_outer_inner * 24 + 970] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 970] - -conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 970] - -5 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 970] - -6 - -conv2d_nchw[6] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -196 - -rc_outer_inner * 504 + threadIdx_x + 196 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] - -24 - -rc_outer_inner * 24 - -1162 - -rc_outer_inner * 24 + 1162 - -kernel_shared[rc_outer_inner * 24 + 1162] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 1162] - -conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 1162] - -6 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 1162] - -7 - -conv2d_nchw[7] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -196 - -rc_outer_inner * 504 + threadIdx_x + 196 - 
-pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] - -24 - -rc_outer_inner * 24 - -1354 - -rc_outer_inner * 24 + 1354 - -kernel_shared[rc_outer_inner * 24 + 1354] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 1354] - -conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 1354] - -7 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 1354] - -4 - -conv2d_nchw[4] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -259 - -rc_outer_inner * 504 + threadIdx_x + 259 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] - -24 - -rc_outer_inner * 24 - -781 - -rc_outer_inner * 24 + 781 - -kernel_shared[rc_outer_inner * 24 + 781] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 781] - -conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 781] - -4 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 781] - -5 - -conv2d_nchw[5] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -259 - -rc_outer_inner * 504 + threadIdx_x + 259 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] - -24 - -rc_outer_inner * 24 - -973 - -rc_outer_inner * 24 + 973 - -kernel_shared[rc_outer_inner * 24 + 973] - -pad_temp_shared[rc_outer_inner * 504 
+ threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 973] - -conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 973] - -5 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 973] - -6 - -conv2d_nchw[6] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -259 - -rc_outer_inner * 504 + threadIdx_x + 259 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] - -24 - -rc_outer_inner * 24 - -1165 - -rc_outer_inner * 24 + 1165 - -kernel_shared[rc_outer_inner * 24 + 1165] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 1165] - -conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 1165] - -6 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 1165] - -7 - -conv2d_nchw[7] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -259 - -rc_outer_inner * 504 + threadIdx_x + 259 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] - -24 - -rc_outer_inner * 24 - -1357 - -rc_outer_inner * 24 + 1357 - -kernel_shared[rc_outer_inner * 24 + 1357] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 1357] - -conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 1357] - -7 - 
-conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 1357] - -4 - -conv2d_nchw[4] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -322 - -rc_outer_inner * 504 + threadIdx_x + 322 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] - -24 - -rc_outer_inner * 24 - -784 - -rc_outer_inner * 24 + 784 - -kernel_shared[rc_outer_inner * 24 + 784] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 784] - -conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 784] - -4 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 784] - -5 - -conv2d_nchw[5] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -322 - -rc_outer_inner * 504 + threadIdx_x + 322 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] - -24 - -rc_outer_inner * 24 - -976 - -rc_outer_inner * 24 + 976 - -kernel_shared[rc_outer_inner * 24 + 976] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 976] - -conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 976] - -5 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") 
-conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 976] - -6 - -conv2d_nchw[6] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -322 - -rc_outer_inner * 504 + threadIdx_x + 322 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] - -24 - -rc_outer_inner * 24 - -1168 - -rc_outer_inner * 24 + 1168 - -kernel_shared[rc_outer_inner * 24 + 1168] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 1168] - -conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 1168] - -6 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 1168] - -7 - -conv2d_nchw[7] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -322 - -rc_outer_inner * 504 + threadIdx_x + 322 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] - -24 - -rc_outer_inner * 24 - -1360 - -rc_outer_inner * 24 + 1360 - -kernel_shared[rc_outer_inner * 24 + 1360] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 1360] - -conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 1360] - -7 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 1360] - -4 - -conv2d_nchw[4] - -504 - -rc_outer_inner * 504 - 
-rc_outer_inner * 504 + threadIdx_x - -385 - -rc_outer_inner * 504 + threadIdx_x + 385 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] - -24 - -rc_outer_inner * 24 - -787 - -rc_outer_inner * 24 + 787 - -kernel_shared[rc_outer_inner * 24 + 787] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 787] - -conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 787] - -4 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 787] - -5 - -conv2d_nchw[5] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -385 - -rc_outer_inner * 504 + threadIdx_x + 385 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] - -24 - -rc_outer_inner * 24 - -979 - -rc_outer_inner * 24 + 979 - -kernel_shared[rc_outer_inner * 24 + 979] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 979] - -conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 979] - -5 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 979] - -6 - -conv2d_nchw[6] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -385 - -rc_outer_inner * 504 + threadIdx_x + 385 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] - -24 - -rc_outer_inner * 24 - -1171 - -rc_outer_inner * 24 + 
1171 - -kernel_shared[rc_outer_inner * 24 + 1171] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 1171] - -conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 1171] - -6 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 1171] - -7 - -conv2d_nchw[7] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -385 - -rc_outer_inner * 504 + threadIdx_x + 385 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] - -24 - -rc_outer_inner * 24 - -1363 - -rc_outer_inner * 24 + 1363 - -kernel_shared[rc_outer_inner * 24 + 1363] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 1363] - -conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 1363] - -7 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 1363] - -4 - -conv2d_nchw[4] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -448 - -rc_outer_inner * 504 + threadIdx_x + 448 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] - -24 - -rc_outer_inner * 24 - -790 - -rc_outer_inner * 24 + 790 - -kernel_shared[rc_outer_inner * 24 + 790] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 790] - -conv2d_nchw[4] + pad_temp_shared[rc_outer_inner 
* 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 790] - -4 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 790] - -5 - -conv2d_nchw[5] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -448 - -rc_outer_inner * 504 + threadIdx_x + 448 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] - -24 - -rc_outer_inner * 24 - -982 - -rc_outer_inner * 24 + 982 - -kernel_shared[rc_outer_inner * 24 + 982] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 982] - -conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 982] - -5 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 982] - -6 - -conv2d_nchw[6] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -448 - -rc_outer_inner * 504 + threadIdx_x + 448 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] - -24 - -rc_outer_inner * 24 - -1174 - -rc_outer_inner * 24 + 1174 - -kernel_shared[rc_outer_inner * 24 + 1174] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 1174] - -conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 1174] - -6 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = 
T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 1174] - -7 - -conv2d_nchw[7] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -448 - -rc_outer_inner * 504 + threadIdx_x + 448 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] - -24 - -rc_outer_inner * 24 - -1366 - -rc_outer_inner * 24 + 1366 - -kernel_shared[rc_outer_inner * 24 + 1366] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 1366] - -conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 1366] - -7 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 1366] - -0 - -conv2d_nchw[0] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -14 - -rc_outer_inner * 504 + threadIdx_x + 14 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] - -24 - -rc_outer_inner * 24 - -2 - -rc_outer_inner * 24 + 2 - -kernel_shared[rc_outer_inner * 24 + 2] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 2] - -conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 2] - -0 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 2] - 
-1 - -conv2d_nchw[1] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -14 - -rc_outer_inner * 504 + threadIdx_x + 14 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] - -24 - -rc_outer_inner * 24 - -194 - -rc_outer_inner * 24 + 194 - -kernel_shared[rc_outer_inner * 24 + 194] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 194] - -conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 194] - -1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 194] - -2 - -conv2d_nchw[2] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -14 - -rc_outer_inner * 504 + threadIdx_x + 14 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] - -24 - -rc_outer_inner * 24 - -386 - -rc_outer_inner * 24 + 386 - -kernel_shared[rc_outer_inner * 24 + 386] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 386] - -conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 386] - -2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 386] - -3 - -conv2d_nchw[3] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -14 - -rc_outer_inner * 504 + threadIdx_x + 14 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] - -24 - -rc_outer_inner * 
24 - -578 - -rc_outer_inner * 24 + 578 - -kernel_shared[rc_outer_inner * 24 + 578] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 578] - -conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 578] - -3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 578] - -0 - -conv2d_nchw[0] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -77 - -rc_outer_inner * 504 + threadIdx_x + 77 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] - -24 - -rc_outer_inner * 24 - -5 - -rc_outer_inner * 24 + 5 - -kernel_shared[rc_outer_inner * 24 + 5] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 5] - -conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 5] - -0 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 5] - -1 - -conv2d_nchw[1] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -77 - -rc_outer_inner * 504 + threadIdx_x + 77 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] - -24 - -rc_outer_inner * 24 - -197 - -rc_outer_inner * 24 + 197 - -kernel_shared[rc_outer_inner * 24 + 197] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 197] - -conv2d_nchw[1] + pad_temp_shared[rc_outer_inner 
* 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 197] - -1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 197] - -2 - -conv2d_nchw[2] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -77 - -rc_outer_inner * 504 + threadIdx_x + 77 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] - -24 - -rc_outer_inner * 24 - -389 - -rc_outer_inner * 24 + 389 - -kernel_shared[rc_outer_inner * 24 + 389] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 389] - -conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 389] - -2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 389] - -3 - -conv2d_nchw[3] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -77 - -rc_outer_inner * 504 + threadIdx_x + 77 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] - -24 - -rc_outer_inner * 24 - -581 - -rc_outer_inner * 24 + 581 - -kernel_shared[rc_outer_inner * 24 + 581] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 581] - -conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 581] - -3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x 
= T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 581] - -0 - -conv2d_nchw[0] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -140 - -rc_outer_inner * 504 + threadIdx_x + 140 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] - -24 - -rc_outer_inner * 24 - -8 - -rc_outer_inner * 24 + 8 - -kernel_shared[rc_outer_inner * 24 + 8] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 8] - -conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 8] - -0 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 8] - -1 - -conv2d_nchw[1] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -140 - -rc_outer_inner * 504 + threadIdx_x + 140 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] - -24 - -rc_outer_inner * 24 - -200 - -rc_outer_inner * 24 + 200 - -kernel_shared[rc_outer_inner * 24 + 200] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 200] - -conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 200] - -1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 200] - -2 - -conv2d_nchw[2] - 
-504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -140 - -rc_outer_inner * 504 + threadIdx_x + 140 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] - -24 - -rc_outer_inner * 24 - -392 - -rc_outer_inner * 24 + 392 - -kernel_shared[rc_outer_inner * 24 + 392] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 392] - -conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 392] - -2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 392] - -3 - -conv2d_nchw[3] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -140 - -rc_outer_inner * 504 + threadIdx_x + 140 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] - -24 - -rc_outer_inner * 24 - -584 - -rc_outer_inner * 24 + 584 - -kernel_shared[rc_outer_inner * 24 + 584] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 584] - -conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 584] - -3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 584] - -0 - -conv2d_nchw[0] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -203 - -rc_outer_inner * 504 + threadIdx_x + 203 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] - -24 - -rc_outer_inner * 24 - -11 
- -rc_outer_inner * 24 + 11 - -kernel_shared[rc_outer_inner * 24 + 11] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 11] - -conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 11] - -0 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 11] - -1 - -conv2d_nchw[1] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -203 - -rc_outer_inner * 504 + threadIdx_x + 203 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] - -24 - -rc_outer_inner * 24 - -203 - -rc_outer_inner * 24 + 203 - -kernel_shared[rc_outer_inner * 24 + 203] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 203] - -conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 203] - -1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 203] - -2 - -conv2d_nchw[2] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -203 - -rc_outer_inner * 504 + threadIdx_x + 203 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] - -24 - -rc_outer_inner * 24 - -395 - -rc_outer_inner * 24 + 395 - -kernel_shared[rc_outer_inner * 24 + 395] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 395] - -conv2d_nchw[2] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 395] - -2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 395] - -3 - -conv2d_nchw[3] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -203 - -rc_outer_inner * 504 + threadIdx_x + 203 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] - -24 - -rc_outer_inner * 24 - -587 - -rc_outer_inner * 24 + 587 - -kernel_shared[rc_outer_inner * 24 + 587] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 587] - -conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 587] - -3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 587] - -0 - -conv2d_nchw[0] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -266 - -rc_outer_inner * 504 + threadIdx_x + 266 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] - -24 - -rc_outer_inner * 24 - -14 - -rc_outer_inner * 24 + 14 - -kernel_shared[rc_outer_inner * 24 + 14] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 14] - -conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 14] - -0 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") 
-rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 14] - -1 - -conv2d_nchw[1] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -266 - -rc_outer_inner * 504 + threadIdx_x + 266 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] - -24 - -rc_outer_inner * 24 - -206 - -rc_outer_inner * 24 + 206 - -kernel_shared[rc_outer_inner * 24 + 206] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 206] - -conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 206] - -1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 206] - -2 - -conv2d_nchw[2] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -266 - -rc_outer_inner * 504 + threadIdx_x + 266 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] - -24 - -rc_outer_inner * 24 - -398 - -rc_outer_inner * 24 + 398 - -kernel_shared[rc_outer_inner * 24 + 398] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 398] - -conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 398] - -2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * 
kernel_shared[rc_outer_inner * 24 + 398] - -3 - -conv2d_nchw[3] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -266 - -rc_outer_inner * 504 + threadIdx_x + 266 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] - -24 - -rc_outer_inner * 24 - -590 - -rc_outer_inner * 24 + 590 - -kernel_shared[rc_outer_inner * 24 + 590] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 590] - -conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 590] - -3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 590] - -0 - -conv2d_nchw[0] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -329 - -rc_outer_inner * 504 + threadIdx_x + 329 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] - -24 - -rc_outer_inner * 24 - -17 - -rc_outer_inner * 24 + 17 - -kernel_shared[rc_outer_inner * 24 + 17] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 17] - -conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 17] - -0 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 17] - -1 - -conv2d_nchw[1] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -329 - -rc_outer_inner * 504 + threadIdx_x + 329 - -pad_temp_shared[rc_outer_inner * 
504 + threadIdx_x + 329] - -24 - -rc_outer_inner * 24 - -209 - -rc_outer_inner * 24 + 209 - -kernel_shared[rc_outer_inner * 24 + 209] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 209] - -conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 209] - -1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 209] - -2 - -conv2d_nchw[2] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -329 - -rc_outer_inner * 504 + threadIdx_x + 329 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] - -24 - -rc_outer_inner * 24 - -401 - -rc_outer_inner * 24 + 401 - -kernel_shared[rc_outer_inner * 24 + 401] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 401] - -conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 401] - -2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 401] - -3 - -conv2d_nchw[3] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -329 - -rc_outer_inner * 504 + threadIdx_x + 329 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] - -24 - -rc_outer_inner * 24 - -593 - -rc_outer_inner * 24 + 593 - -kernel_shared[rc_outer_inner * 24 + 593] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * 
kernel_shared[rc_outer_inner * 24 + 593] - -conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 593] - -3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 593] - -0 - -conv2d_nchw[0] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -392 - -rc_outer_inner * 504 + threadIdx_x + 392 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] - -24 - -rc_outer_inner * 24 - -20 - -rc_outer_inner * 24 + 20 - -kernel_shared[rc_outer_inner * 24 + 20] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 20] - -conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 20] - -0 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 20] - -1 - -conv2d_nchw[1] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -392 - -rc_outer_inner * 504 + threadIdx_x + 392 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] - -24 - -rc_outer_inner * 24 - -212 - -rc_outer_inner * 24 + 212 - -kernel_shared[rc_outer_inner * 24 + 212] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 212] - -conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 212] - -1 - -conv2d_nchw = T.Buffer((8,), scope="local", 
align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 212] - -2 - -conv2d_nchw[2] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -392 - -rc_outer_inner * 504 + threadIdx_x + 392 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] - -24 - -rc_outer_inner * 24 - -404 - -rc_outer_inner * 24 + 404 - -kernel_shared[rc_outer_inner * 24 + 404] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 404] - -conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 404] - -2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 404] - -3 - -conv2d_nchw[3] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -392 - -rc_outer_inner * 504 + threadIdx_x + 392 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] - -24 - -rc_outer_inner * 24 - -596 - -rc_outer_inner * 24 + 596 - -kernel_shared[rc_outer_inner * 24 + 596] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 596] - -conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 596] - -3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[3] = conv2d_nchw[3] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 596] - -0 - -conv2d_nchw[0] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -455 - -rc_outer_inner * 504 + threadIdx_x + 455 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] - -24 - -rc_outer_inner * 24 - -23 - -rc_outer_inner * 24 + 23 - -kernel_shared[rc_outer_inner * 24 + 23] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 23] - -conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 23] - -0 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 23] - -1 - -conv2d_nchw[1] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -455 - -rc_outer_inner * 504 + threadIdx_x + 455 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] - -24 - -rc_outer_inner * 24 - -215 - -rc_outer_inner * 24 + 215 - -kernel_shared[rc_outer_inner * 24 + 215] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 215] - -conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 215] - -1 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 215] - -2 - -conv2d_nchw[2] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -455 - -rc_outer_inner * 
504 + threadIdx_x + 455 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] - -24 - -rc_outer_inner * 24 - -407 - -rc_outer_inner * 24 + 407 - -kernel_shared[rc_outer_inner * 24 + 407] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 407] - -conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 407] - -2 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 407] - -3 - -conv2d_nchw[3] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -455 - -rc_outer_inner * 504 + threadIdx_x + 455 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] - -24 - -rc_outer_inner * 24 - -599 - -rc_outer_inner * 24 + 599 - -kernel_shared[rc_outer_inner * 24 + 599] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 599] - -conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 599] - -3 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 599] - -4 - -conv2d_nchw[4] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -14 - -rc_outer_inner * 504 + threadIdx_x + 14 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] - -24 - -rc_outer_inner * 24 - -770 - -rc_outer_inner * 24 + 770 - -kernel_shared[rc_outer_inner * 24 + 770] - 
-pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 770] - -conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 770] - -4 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 770] - -5 - -conv2d_nchw[5] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -14 - -rc_outer_inner * 504 + threadIdx_x + 14 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] - -24 - -rc_outer_inner * 24 - -962 - -rc_outer_inner * 24 + 962 - -kernel_shared[rc_outer_inner * 24 + 962] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 962] - -conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 962] - -5 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 962] - -6 - -conv2d_nchw[6] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -14 - -rc_outer_inner * 504 + threadIdx_x + 14 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] - -24 - -rc_outer_inner * 24 - -1154 - -rc_outer_inner * 24 + 1154 - -kernel_shared[rc_outer_inner * 24 + 1154] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 1154] - -conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 1154] 
- -6 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 1154] - -7 - -conv2d_nchw[7] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -14 - -rc_outer_inner * 504 + threadIdx_x + 14 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] - -24 - -rc_outer_inner * 24 - -1346 - -rc_outer_inner * 24 + 1346 - -kernel_shared[rc_outer_inner * 24 + 1346] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 1346] - -conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 1346] - -7 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 1346] - -4 - -conv2d_nchw[4] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -77 - -rc_outer_inner * 504 + threadIdx_x + 77 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] - -24 - -rc_outer_inner * 24 - -773 - -rc_outer_inner * 24 + 773 - -kernel_shared[rc_outer_inner * 24 + 773] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 773] - -conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 773] - -4 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") 
-conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 773] - -5 - -conv2d_nchw[5] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -77 - -rc_outer_inner * 504 + threadIdx_x + 77 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] - -24 - -rc_outer_inner * 24 - -965 - -rc_outer_inner * 24 + 965 - -kernel_shared[rc_outer_inner * 24 + 965] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 965] - -conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 965] - -5 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 965] - -6 - -conv2d_nchw[6] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -77 - -rc_outer_inner * 504 + threadIdx_x + 77 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] - -24 - -rc_outer_inner * 24 - -1157 - -rc_outer_inner * 24 + 1157 - -kernel_shared[rc_outer_inner * 24 + 1157] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 1157] - -conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 1157] - -6 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 1157] - -7 - -conv2d_nchw[7] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + 
threadIdx_x - -77 - -rc_outer_inner * 504 + threadIdx_x + 77 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] - -24 - -rc_outer_inner * 24 - -1349 - -rc_outer_inner * 24 + 1349 - -kernel_shared[rc_outer_inner * 24 + 1349] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 1349] - -conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 1349] - -7 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 1349] - -4 - -conv2d_nchw[4] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -140 - -rc_outer_inner * 504 + threadIdx_x + 140 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] - -24 - -rc_outer_inner * 24 - -776 - -rc_outer_inner * 24 + 776 - -kernel_shared[rc_outer_inner * 24 + 776] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 776] - -conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 776] - -4 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 776] - -5 - -conv2d_nchw[5] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -140 - -rc_outer_inner * 504 + threadIdx_x + 140 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] - -24 - -rc_outer_inner * 24 - -968 - -rc_outer_inner * 24 + 968 - 
-kernel_shared[rc_outer_inner * 24 + 968] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 968] - -conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 968] - -5 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 968] - -6 - -conv2d_nchw[6] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -140 - -rc_outer_inner * 504 + threadIdx_x + 140 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] - -24 - -rc_outer_inner * 24 - -1160 - -rc_outer_inner * 24 + 1160 - -kernel_shared[rc_outer_inner * 24 + 1160] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 1160] - -conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 1160] - -6 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 1160] - -7 - -conv2d_nchw[7] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -140 - -rc_outer_inner * 504 + threadIdx_x + 140 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] - -24 - -rc_outer_inner * 24 - -1352 - -rc_outer_inner * 24 + 1352 - -kernel_shared[rc_outer_inner * 24 + 1352] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 1352] - -conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 
+ threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 1352] - -7 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 1352] - -4 - -conv2d_nchw[4] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -203 - -rc_outer_inner * 504 + threadIdx_x + 203 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] - -24 - -rc_outer_inner * 24 - -779 - -rc_outer_inner * 24 + 779 - -kernel_shared[rc_outer_inner * 24 + 779] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 779] - -conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 779] - -4 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 779] - -5 - -conv2d_nchw[5] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -203 - -rc_outer_inner * 504 + threadIdx_x + 203 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] - -24 - -rc_outer_inner * 24 - -971 - -rc_outer_inner * 24 + 971 - -kernel_shared[rc_outer_inner * 24 + 971] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 971] - -conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 971] - -5 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() 
-threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 971] - -6 - -conv2d_nchw[6] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -203 - -rc_outer_inner * 504 + threadIdx_x + 203 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] - -24 - -rc_outer_inner * 24 - -1163 - -rc_outer_inner * 24 + 1163 - -kernel_shared[rc_outer_inner * 24 + 1163] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 1163] - -conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 1163] - -6 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 1163] - -7 - -conv2d_nchw[7] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -203 - -rc_outer_inner * 504 + threadIdx_x + 203 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] - -24 - -rc_outer_inner * 24 - -1355 - -rc_outer_inner * 24 + 1355 - -kernel_shared[rc_outer_inner * 24 + 1355] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 1355] - -conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 1355] - -7 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 
24 + 1355] - -4 - -conv2d_nchw[4] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -266 - -rc_outer_inner * 504 + threadIdx_x + 266 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] - -24 - -rc_outer_inner * 24 - -782 - -rc_outer_inner * 24 + 782 - -kernel_shared[rc_outer_inner * 24 + 782] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 782] - -conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 782] - -4 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 782] - -5 - -conv2d_nchw[5] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -266 - -rc_outer_inner * 504 + threadIdx_x + 266 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] - -24 - -rc_outer_inner * 24 - -974 - -rc_outer_inner * 24 + 974 - -kernel_shared[rc_outer_inner * 24 + 974] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 974] - -conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 974] - -5 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 974] - -6 - -conv2d_nchw[6] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -266 - -rc_outer_inner * 504 + threadIdx_x + 266 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 
266] - -24 - -rc_outer_inner * 24 - -1166 - -rc_outer_inner * 24 + 1166 - -kernel_shared[rc_outer_inner * 24 + 1166] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 1166] - -conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 1166] - -6 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 1166] - -7 - -conv2d_nchw[7] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -266 - -rc_outer_inner * 504 + threadIdx_x + 266 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] - -24 - -rc_outer_inner * 24 - -1358 - -rc_outer_inner * 24 + 1358 - -kernel_shared[rc_outer_inner * 24 + 1358] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 1358] - -conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 1358] - -7 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 1358] - -4 - -conv2d_nchw[4] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -329 - -rc_outer_inner * 504 + threadIdx_x + 329 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] - -24 - -rc_outer_inner * 24 - -785 - -rc_outer_inner * 24 + 785 - -kernel_shared[rc_outer_inner * 24 + 785] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * 
kernel_shared[rc_outer_inner * 24 + 785] - -conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 785] - -4 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 785] - -5 - -conv2d_nchw[5] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -329 - -rc_outer_inner * 504 + threadIdx_x + 329 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] - -24 - -rc_outer_inner * 24 - -977 - -rc_outer_inner * 24 + 977 - -kernel_shared[rc_outer_inner * 24 + 977] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 977] - -conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 977] - -5 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 977] - -6 - -conv2d_nchw[6] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -329 - -rc_outer_inner * 504 + threadIdx_x + 329 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] - -24 - -rc_outer_inner * 24 - -1169 - -rc_outer_inner * 24 + 1169 - -kernel_shared[rc_outer_inner * 24 + 1169] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 1169] - -conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 1169] - -6 - -conv2d_nchw = T.Buffer((8,), 
scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 1169] - -7 - -conv2d_nchw[7] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -329 - -rc_outer_inner * 504 + threadIdx_x + 329 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] - -24 - -rc_outer_inner * 24 - -1361 - -rc_outer_inner * 24 + 1361 - -kernel_shared[rc_outer_inner * 24 + 1361] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 1361] - -conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 1361] - -7 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 1361] - -4 - -conv2d_nchw[4] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -392 - -rc_outer_inner * 504 + threadIdx_x + 392 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] - -24 - -rc_outer_inner * 24 - -788 - -rc_outer_inner * 24 + 788 - -kernel_shared[rc_outer_inner * 24 + 788] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 788] - -conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 788] - -4 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[4] = 
conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 788] - -5 - -conv2d_nchw[5] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -392 - -rc_outer_inner * 504 + threadIdx_x + 392 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] - -24 - -rc_outer_inner * 24 - -980 - -rc_outer_inner * 24 + 980 - -kernel_shared[rc_outer_inner * 24 + 980] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 980] - -conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 980] - -5 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 980] - -6 - -conv2d_nchw[6] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -392 - -rc_outer_inner * 504 + threadIdx_x + 392 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] - -24 - -rc_outer_inner * 24 - -1172 - -rc_outer_inner * 24 + 1172 - -kernel_shared[rc_outer_inner * 24 + 1172] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 1172] - -conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 1172] - -6 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 1172] - -7 - -conv2d_nchw[7] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + 
threadIdx_x - -392 - -rc_outer_inner * 504 + threadIdx_x + 392 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] - -24 - -rc_outer_inner * 24 - -1364 - -rc_outer_inner * 24 + 1364 - -kernel_shared[rc_outer_inner * 24 + 1364] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 1364] - -conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 1364] - -7 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 1364] - -4 - -conv2d_nchw[4] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -455 - -rc_outer_inner * 504 + threadIdx_x + 455 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] - -24 - -rc_outer_inner * 24 - -791 - -rc_outer_inner * 24 + 791 - -kernel_shared[rc_outer_inner * 24 + 791] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 791] - -conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 791] - -4 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 791] - -5 - -conv2d_nchw[5] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -455 - -rc_outer_inner * 504 + threadIdx_x + 455 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] - -24 - -rc_outer_inner * 24 - -983 - -rc_outer_inner * 24 + 983 - 
-kernel_shared[rc_outer_inner * 24 + 983] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 983] - -conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 983] - -5 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 983] - -6 - -conv2d_nchw[6] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -455 - -rc_outer_inner * 504 + threadIdx_x + 455 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] - -24 - -rc_outer_inner * 24 - -1175 - -rc_outer_inner * 24 + 1175 - -kernel_shared[rc_outer_inner * 24 + 1175] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 1175] - -conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 1175] - -6 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 1175] - -7 - -conv2d_nchw[7] - -504 - -rc_outer_inner * 504 - -rc_outer_inner * 504 + threadIdx_x - -455 - -rc_outer_inner * 504 + threadIdx_x + 455 - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] - -24 - -rc_outer_inner * 24 - -1367 - -rc_outer_inner * 24 + 1367 - -kernel_shared[rc_outer_inner * 24 + 1367] - -pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 1367] - -conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 
+ threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 1367] - -7 - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 1367] - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -pad_temp_shared = T.Buffer((4032,), scope="shared") -rc_outer_inner = T.int32() -threadIdx_x = T.int32() -kernel_shared = T.Buffer((1536,), scope="shared") -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24] -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 192] -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 384] -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 576] -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 3] -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 195] -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 387] -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 579] -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 6] -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 198] -conv2d_nchw[2] = conv2d_nchw[2] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 390] -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 582] -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 9] -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 201] -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 393] -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 585] -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 12] -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 204] -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 396] -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 588] -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 15] -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 207] -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 399] -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 591] -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 18] 
-conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 210] -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 402] -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 594] -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 21] -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 213] -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 405] -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 597] -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 768] -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 960] -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 1152] -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 1344] -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 771] -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 963] -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 1155] -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * 
kernel_shared[rc_outer_inner * 24 + 1347] -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 774] -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 966] -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 1158] -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 1350] -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 777] -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 969] -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 1161] -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 1353] -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 780] -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 972] -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 1164] -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 1356] -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 783] -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 975] -conv2d_nchw[6] = conv2d_nchw[6] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 1167] -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 1359] -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 786] -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 978] -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 1170] -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 1362] -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 789] -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 981] -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 1173] -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 1365] -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1] -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 193] -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 385] -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 577] -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 4] 
-conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 196] -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 388] -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 580] -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 7] -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 199] -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 391] -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 583] -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 10] -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 202] -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 394] -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 586] -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 13] -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 205] -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 397] -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * 
kernel_shared[rc_outer_inner * 24 + 589] -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 16] -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 208] -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 400] -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 592] -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 19] -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 211] -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 403] -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 595] -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 22] -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 214] -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 406] -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 598] -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 769] -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 961] -conv2d_nchw[6] = conv2d_nchw[6] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1153] -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1345] -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 772] -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 964] -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 1156] -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 1348] -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 775] -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 967] -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 1159] -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 1351] -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 778] -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 970] -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 1162] -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 1354] -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 
781] -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 973] -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 1165] -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 1357] -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 784] -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 976] -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 1168] -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 1360] -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 787] -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 979] -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 1171] -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 1363] -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 790] -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 982] -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 1174] -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + 
threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 1366] -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 2] -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 194] -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 386] -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 578] -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 5] -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 197] -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 389] -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 581] -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 8] -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 200] -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 392] -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 584] -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 11] -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 203] -conv2d_nchw[2] = conv2d_nchw[2] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 395] -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 587] -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 14] -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 206] -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 398] -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 590] -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 17] -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 209] -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 401] -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 593] -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 20] -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 212] -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 404] -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 596] -conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 
23] -conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 215] -conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 407] -conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 599] -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 770] -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 962] -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 1154] -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 1346] -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 773] -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 965] -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 1157] -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 1349] -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 776] -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 968] -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 1160] -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * 
kernel_shared[rc_outer_inner * 24 + 1352] -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 779] -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 971] -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 1163] -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 1355] -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 782] -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 974] -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 1166] -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 1358] -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 785] -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 977] -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 1169] -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 1361] -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 788] -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 980] -conv2d_nchw[6] = conv2d_nchw[6] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 1172] -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 1364] -conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 791] -conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 983] -conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 1175] -conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 1367] - -for rc_outer_inner in range(8): - conv2d_nchw = T.Buffer((8,), scope="local", align=32) - pad_temp_shared = T.Buffer((4032,), scope="shared") - threadIdx_x = T.int32() - kernel_shared = T.Buffer((1536,), scope="shared") - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 192] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 384] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 576] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 3] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 195] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 387] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner 
* 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 579] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 6] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 198] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 390] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 582] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 9] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 201] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 393] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 585] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 12] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 204] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 396] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 588] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 15] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 207] - conv2d_nchw[2] = 
conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 399] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 591] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 18] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 210] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 402] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 594] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 21] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 213] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 405] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 597] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 768] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 960] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 1152] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 1344] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner 
* 24 + 771] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 963] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 1155] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 1347] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 774] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 966] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 1158] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 1350] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 777] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 969] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 1161] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 1353] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 780] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 972] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 1164] - conv2d_nchw[7] = conv2d_nchw[7] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 1356] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 783] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 975] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 1167] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 1359] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 786] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 978] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 1170] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 1362] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 789] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 981] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 1173] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 1365] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * 
kernel_shared[rc_outer_inner * 24 + 193] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 385] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 577] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 4] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 196] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 388] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 580] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 7] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 199] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 391] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 583] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 10] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 202] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 394] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 586] - conv2d_nchw[0] = conv2d_nchw[0] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 13] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 205] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 397] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 589] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 16] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 208] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 400] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 592] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 19] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 211] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 403] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 595] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 22] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 214] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * 
kernel_shared[rc_outer_inner * 24 + 406] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 598] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 769] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 961] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1153] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1345] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 772] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 964] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 1156] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 1348] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 775] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 967] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 1159] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 1351] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 778] - conv2d_nchw[5] = conv2d_nchw[5] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 970] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 1162] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 1354] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 781] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 973] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 1165] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 1357] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 784] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 976] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 1168] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 1360] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 787] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 979] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 1171] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * 
kernel_shared[rc_outer_inner * 24 + 1363] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 790] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 982] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 1174] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 1366] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 2] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 194] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 386] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 578] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 5] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 197] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 389] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 581] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 8] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 200] - conv2d_nchw[2] = conv2d_nchw[2] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 392] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 584] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 11] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 203] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 395] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 587] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 14] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 206] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 398] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 590] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 17] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 209] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 401] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 593] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * 
kernel_shared[rc_outer_inner * 24 + 20] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 212] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 404] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 596] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 23] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 215] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 407] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 599] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 770] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 962] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 1154] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 1346] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 773] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 965] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 1157] - conv2d_nchw[7] = conv2d_nchw[7] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 1349] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 776] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 968] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 1160] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 1352] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 779] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 971] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 1163] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 1355] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 782] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 974] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 1166] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 1358] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 785] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * 
kernel_shared[rc_outer_inner * 24 + 977] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 1169] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 1361] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 788] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 980] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 1172] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 1364] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 791] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 983] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 1175] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 1367] - -threadIdx_x = T.env_thread("threadIdx.x") -pad_temp_shared = T.Buffer((4032,), scope="shared") -rx_outer_outer = T.int32() -data = T.Buffer((25088,)) -rc_outer_outer = T.int32() -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 49] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and 
(threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 49) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 98] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 98) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 147] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 147) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 196] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 196) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 245] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 245) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 294] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer 
* 3136 + (threadIdx_x + 294) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 343] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 343) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 392] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 392) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 441] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 335], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 490] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 490) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 539] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 539) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 588] = T.if_then_else(1 <= 
(threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 588) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 637] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 637) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 686] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 686) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 735] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 735) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 784] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 784) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 833] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + 
(threadIdx_x + 833) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 882] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 678], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 931] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 931) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 980] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 980) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1029] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1029) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1078] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1078) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1127] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 
and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1127) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1176] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1176) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1225] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1225) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1274] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1274) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1323] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1021], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1372] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1372) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], 
T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1421] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1421) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1470] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1470) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1519] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1519) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1568] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1568) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1617] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1617) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1666] = T.if_then_else(1 <= 
(threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1666) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1715] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1715) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1764] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1364], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1813] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1813) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1862] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1862) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1911] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1911) // 63 * 49 + 
(threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1960] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1960) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2009] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2009) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2058] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2058) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2107] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2107) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2156] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2156) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2205] = T.if_then_else(7 <= threadIdx_x and 1 <= 
rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1707], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2254] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2254) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2303] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2303) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2352] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2352) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2401] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2401) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2450] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2450) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + 
threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2499] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2499) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2548] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2548) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2597] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2597) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2646] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2050], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2695] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2695) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2744] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= 
rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2744) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2793] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2793) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2842] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2842) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2891] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2891) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2940] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2940) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2989] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 
2989) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3038] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3038) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3087] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2393], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3136] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3136) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3185] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3185) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3234] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3234) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3283] = T.if_then_else(1 <= 
rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3283) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3332] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3332) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3381] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3381) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3430] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3430) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3479] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3479) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3528] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2736], T.float32(0.0)) -with 
T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3577] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3577) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3626] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3626) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3675] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3675) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3724] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3724) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3773] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3773) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3822] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 
and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3822) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3871] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3871) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3920] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3920) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3969] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 3079], T.float32(0.0)) -with T.launch_thread(threadIdx_x, 49): - if threadIdx_x < 14: - pad_temp_shared[threadIdx_x + 4018] = T.if_then_else(threadIdx_x < 7 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x + 41], T.float32(0.0)) -threadIdx_x_1 = T.env_thread("threadIdx.x") -kernel_shared = T.Buffer((1536,), scope="shared") -kernel = T.Buffer((2359296,)) -blockIdx_x = T.int32() -with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 49] = kernel[blockIdx_x 
* 36864 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 147] -with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 98] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 294] -with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 147] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 147) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 147) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 196] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 196) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 12] -with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 245] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 245) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 159] -with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 294] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 294) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 306] -with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 343] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 343) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 151) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 392] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 392) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 24] -with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 441] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 441) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 171] -with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 490] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 490) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 318] -with T.launch_thread(threadIdx_x_1, 49): 
- kernel_shared[threadIdx_x_1 + 539] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 539) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 155) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 588] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 588) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 36] -with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 637] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 637) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 183] -with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 686] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 686) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 330] -with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 735] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 735) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 159) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 784] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 784) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 48] -with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 833] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 833) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 195] -with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 882] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 882) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 342] -with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 931] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 931) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 163) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 980] = kernel[blockIdx_x * 36864 + 
(threadIdx_x_1 + 980) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 60] -with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1029] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1029) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 207] -with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1078] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1078) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 354] -with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1127] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1127) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 167) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1176] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1176) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 72] -with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1225] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1225) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 219] -with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1274] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1274) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 366] -with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1323] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1323) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 171) % 192 * 3 + rx_outer_outer] -with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1372] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1372) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 84] -with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1421] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1421) // 192 * 4608 + rc_outer_outer * 576 + 
threadIdx_x_1 * 3 + rx_outer_outer + 231] -with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1470] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1470) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 378] -with T.launch_thread(threadIdx_x_1, 49): - if threadIdx_x_1 < 17: - kernel_shared[threadIdx_x_1 + 1519] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1519) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 525] -for rc_outer_inner in range(8): - conv2d_nchw = T.Buffer((8,), scope="local", align=32) - threadIdx_x_2 = T.int32() - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 192] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 384] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 576] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 3] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 195] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 387] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 579] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 6] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 198] - conv2d_nchw[2] = conv2d_nchw[2] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 390] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 582] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 9] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 201] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 393] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 585] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 12] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 204] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 396] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 588] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 15] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 207] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 399] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 591] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 
378] * kernel_shared[rc_outer_inner * 24 + 18] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 210] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 402] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 594] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 21] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 213] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 405] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 597] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 768] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 960] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 1152] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 1344] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 771] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 963] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 1155] - conv2d_nchw[7] = 
conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 1347] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 774] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 966] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 1158] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 1350] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 777] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 969] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 1161] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 1353] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 780] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 972] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 1164] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 1356] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 783] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner 
* 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 975] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 1167] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 1359] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 786] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 978] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 1170] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 1362] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 789] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 981] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 1173] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 1365] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 1] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 193] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 385] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * 
kernel_shared[rc_outer_inner * 24 + 577] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 4] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 196] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 388] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 580] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 7] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 199] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 391] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 583] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 10] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 202] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 394] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 586] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 13] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 205] - conv2d_nchw[2] = 
conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 397] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 589] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 16] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 208] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 400] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 592] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 19] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 211] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 403] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 595] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 22] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 214] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 406] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 598] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + 
threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 769] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 961] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 1153] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 1345] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 772] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 964] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 1156] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 1348] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 775] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 967] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 1159] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 1351] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 778] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 970] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 
1162] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 1354] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 781] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 973] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 1165] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 1357] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 784] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 976] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 1168] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 1360] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 787] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 979] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 1171] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 1363] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 790] - conv2d_nchw[5] = conv2d_nchw[5] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 982] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 1174] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 1366] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 2] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 194] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 386] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 578] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 5] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 197] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 389] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 581] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 8] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 200] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 392] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * 
kernel_shared[rc_outer_inner * 24 + 584] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 11] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 203] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 395] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 587] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 14] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 206] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 398] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 590] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 17] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 209] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 401] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 593] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 20] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 212] - conv2d_nchw[2] 
= conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 404] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 596] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 23] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 215] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 407] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 599] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 770] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 962] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 1154] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 1346] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 773] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 965] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 1157] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 1349] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + 
threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 776] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 968] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 1160] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 1352] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 779] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 971] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 1163] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 1355] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 782] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 974] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 1166] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 1358] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 785] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 977] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * 
kernel_shared[rc_outer_inner * 24 + 1169] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 1361] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 788] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 980] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 1172] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 1364] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 791] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 983] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 1175] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 1367] - -for rx_outer_outer in range(3): - threadIdx_x = T.env_thread("threadIdx.x") - pad_temp_shared = T.Buffer((4032,), scope="shared") - data = T.Buffer((25088,)) - rc_outer_outer = T.int32() - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 49] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 
8, data[rc_outer_outer * 3136 + (threadIdx_x + 49) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 98] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 98) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 147] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 147) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 196] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 196) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 245] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 245) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 294] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 294) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 
- 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 343] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 343) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 392] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 392) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 441] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 335], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 490] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 490) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 539] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 539) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 588] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 
7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 588) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 637] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 637) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 686] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 686) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 735] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 735) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 784] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 784) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 833] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 833) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - with 
T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 882] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 678], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 931] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 931) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 980] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 980) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1029] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1029) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1078] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1078) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1127] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, 
data[rc_outer_outer * 3136 + (threadIdx_x + 1127) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1176] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1176) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1225] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1225) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1274] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1274) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1323] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1021], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1372] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1372) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - 
pad_temp_shared[threadIdx_x + 1421] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1421) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1470] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1470) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1519] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1519) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1568] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1568) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1617] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1617) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1666] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 
8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1666) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1715] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1715) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1764] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1364], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1813] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1813) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1862] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1862) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1911] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1911) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 
- 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1960] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1960) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2009] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2009) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2058] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2058) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2107] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2107) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2156] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2156) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2205] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + 
threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1707], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2254] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2254) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2303] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2303) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2352] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2352) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2401] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2401) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2450] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2450) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with 
T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2499] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2499) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2548] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2548) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2597] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2597) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2646] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2050], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2695] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2695) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2744] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and 
rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2744) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2793] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2793) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2842] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2842) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2891] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2891) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2940] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2940) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2989] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2989) // 63 * 49 + (threadIdx_x // 
7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3038] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3038) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3087] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2393], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3136] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3136) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3185] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3185) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3234] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3234) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3283] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and 
rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3283) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3332] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3332) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3381] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3381) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3430] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3430) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3479] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3479) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3528] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2736], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - 
pad_temp_shared[threadIdx_x + 3577] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3577) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3626] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3626) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3675] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3675) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3724] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3724) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3773] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3773) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3822] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 
8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3822) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3871] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3871) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3920] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3920) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3969] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 3079], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if threadIdx_x < 14: - pad_temp_shared[threadIdx_x + 4018] = T.if_then_else(threadIdx_x < 7 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x + 41], T.float32(0.0)) - threadIdx_x_1 = T.env_thread("threadIdx.x") - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 49] = kernel[blockIdx_x * 36864 + 
rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 147] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 98] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 294] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 147] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 147) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 147) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 196] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 196) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 12] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 245] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 245) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 159] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 294] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 294) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 306] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 343] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 343) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 151) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 392] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 392) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 24] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 441] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 441) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 171] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 490] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 490) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 318] - with T.launch_thread(threadIdx_x_1, 49): 
- kernel_shared[threadIdx_x_1 + 539] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 539) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 155) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 588] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 588) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 36] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 637] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 637) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 183] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 686] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 686) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 330] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 735] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 735) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 159) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 784] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 784) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 48] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 833] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 833) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 195] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 882] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 882) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 342] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 931] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 931) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 163) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 980] = kernel[blockIdx_x * 
36864 + (threadIdx_x_1 + 980) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 60] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1029] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1029) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 207] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1078] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1078) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 354] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1127] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1127) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 167) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1176] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1176) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 72] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1225] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1225) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 219] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1274] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1274) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 366] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1323] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1323) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 171) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1372] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1372) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 84] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1421] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1421) // 192 * 4608 + 
rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 231] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1470] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1470) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 378] - with T.launch_thread(threadIdx_x_1, 49): - if threadIdx_x_1 < 17: - kernel_shared[threadIdx_x_1 + 1519] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1519) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 525] - for rc_outer_inner in range(8): - conv2d_nchw = T.Buffer((8,), scope="local", align=32) - threadIdx_x_2 = T.int32() - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 192] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 384] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 576] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 3] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 195] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 387] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 579] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 6] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 198] - 
conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 390] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 582] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 9] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 201] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 393] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 585] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 12] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 204] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 396] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 588] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 15] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 207] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 399] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 591] - conv2d_nchw[0] = conv2d_nchw[0] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 18] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 210] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 402] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 594] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 21] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 213] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 405] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 597] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 768] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 960] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 1152] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 1344] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 771] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 963] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * 
kernel_shared[rc_outer_inner * 24 + 1155] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 1347] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 774] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 966] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 1158] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 1350] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 777] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 969] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 1161] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 1353] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 780] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 972] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 1164] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 1356] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 783] - 
conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 975] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 1167] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 1359] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 786] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 978] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 1170] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 1362] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 789] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 981] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 1173] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 1365] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 1] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 193] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 385] - conv2d_nchw[3] = conv2d_nchw[3] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 577] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 4] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 196] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 388] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 580] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 7] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 199] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 391] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 583] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 10] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 202] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 394] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 586] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 13] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * 
kernel_shared[rc_outer_inner * 24 + 205] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 397] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 589] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 16] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 208] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 400] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 592] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 19] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 211] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 403] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 595] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 22] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 214] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 406] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 598] - 
conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 769] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 961] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 1153] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 1345] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 772] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 964] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 1156] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 1348] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 775] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 967] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 1159] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 1351] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 778] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 970] - conv2d_nchw[6] = conv2d_nchw[6] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 1162] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 1354] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 781] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 973] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 1165] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 1357] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 784] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 976] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 1168] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 1360] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 787] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 979] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 1171] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 1363] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + 
threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 790] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 982] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 1174] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 1366] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 2] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 194] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 386] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 578] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 5] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 197] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 389] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 581] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 8] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 200] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 392] - 
conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 584] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 11] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 203] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 395] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 587] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 14] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 206] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 398] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 590] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 17] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 209] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 401] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 593] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 20] - conv2d_nchw[1] = conv2d_nchw[1] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 212] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 404] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 596] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 23] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 215] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 407] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 599] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 770] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 962] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 1154] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 1346] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 773] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 965] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 1157] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] 
* kernel_shared[rc_outer_inner * 24 + 1349] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 776] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 968] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 1160] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 1352] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 779] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 971] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 1163] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 1355] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 782] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 974] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 1166] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 1358] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 785] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 977] - 
conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 1169] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 1361] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 788] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 980] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 1172] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 1364] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 791] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 983] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 1175] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 1367] - -for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - threadIdx_x = T.env_thread("threadIdx.x") - pad_temp_shared = T.Buffer((4032,), scope="shared") - data = T.Buffer((25088,)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 49] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 
9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 49) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 98] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 98) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 147] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 147) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 196] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 196) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 245] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 245) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 294] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + 
(threadIdx_x + 294) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 343] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 343) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 392] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 392) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 441] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 335], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 490] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 490) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 539] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 539) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 588] = T.if_then_else(1 <= 
(threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 588) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 637] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 637) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 686] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 686) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 735] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 735) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 784] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 784) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 833] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + 
(threadIdx_x + 833) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 882] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 678], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 931] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 931) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 980] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 980) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1029] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1029) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1078] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1078) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1127] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 
9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1127) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1176] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1176) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1225] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1225) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1274] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1274) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1323] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1021], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1372] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1372) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x 
% 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1421] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1421) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1470] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1470) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1519] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1519) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1568] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1568) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1617] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1617) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1666] = 
T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1666) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1715] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1715) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1764] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1364], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1813] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1813) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1862] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1862) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1911] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 
1911) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1960] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1960) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2009] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2009) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2058] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2058) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2107] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2107) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2156] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2156) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2205] = T.if_then_else(7 
<= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1707], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2254] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2254) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2303] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2303) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2352] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2352) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2401] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2401) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2450] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2450) // 63 * 49 + (threadIdx_x // 7 + 8) 
% 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2499] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2499) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2548] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2548) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2597] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2597) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2646] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2050], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2695] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2695) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2744] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 
+ 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2744) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2793] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2793) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2842] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2842) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2891] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2891) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2940] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2940) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2989] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer 
* 3136 + (threadIdx_x + 2989) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3038] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3038) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3087] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2393], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3136] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3136) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3185] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3185) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3234] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3234) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3283] = 
T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3283) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3332] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3332) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3381] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3381) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3430] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3430) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3479] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3479) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3528] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2736], T.float32(0.0)) - 
with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3577] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3577) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3626] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3626) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3675] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3675) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3724] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3724) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3773] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3773) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3822] = T.if_then_else(1 <= (threadIdx_x // 7 
+ 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3822) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3871] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3871) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3920] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3920) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3969] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 3079], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if threadIdx_x < 14: - pad_temp_shared[threadIdx_x + 4018] = T.if_then_else(threadIdx_x < 7 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x + 41], T.float32(0.0)) - threadIdx_x_1 = T.env_thread("threadIdx.x") - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - blockIdx_x = T.int32() - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 49] 
= kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 147] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 98] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 294] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 147] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 147) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 147) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 196] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 196) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 12] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 245] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 245) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 159] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 294] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 294) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 306] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 343] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 343) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 151) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 392] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 392) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 24] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 441] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 441) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 171] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 490] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 490) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 318] - with 
T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 539] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 539) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 155) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 588] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 588) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 36] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 637] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 637) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 183] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 686] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 686) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 330] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 735] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 735) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 159) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 784] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 784) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 48] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 833] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 833) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 195] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 882] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 882) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 342] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 931] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 931) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 163) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - 
kernel_shared[threadIdx_x_1 + 980] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 980) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 60] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1029] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1029) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 207] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1078] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1078) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 354] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1127] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1127) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 167) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1176] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1176) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 72] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1225] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1225) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 219] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1274] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1274) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 366] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1323] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1323) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 171) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1372] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1372) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 84] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1421] = 
kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1421) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 231] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1470] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1470) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 378] - with T.launch_thread(threadIdx_x_1, 49): - if threadIdx_x_1 < 17: - kernel_shared[threadIdx_x_1 + 1519] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1519) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 525] - for rc_outer_inner in range(8): - conv2d_nchw = T.Buffer((8,), scope="local", align=32) - threadIdx_x_2 = T.int32() - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 192] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 384] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 576] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 3] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 195] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 387] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 579] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 6] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + 
threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 198] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 390] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 582] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 9] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 201] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 393] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 585] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 12] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 204] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 396] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 588] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 15] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 207] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 399] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 
+ 591] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 18] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 210] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 402] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 594] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 21] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 213] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 405] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 597] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 768] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 960] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 1152] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 1344] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 771] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 963] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 
+ threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 1155] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 1347] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 774] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 966] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 1158] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 1350] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 777] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 969] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 1161] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 1353] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 780] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 972] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 1164] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 1356] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * 
kernel_shared[rc_outer_inner * 24 + 783] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 975] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 1167] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 1359] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 786] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 978] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 1170] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 1362] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 789] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 981] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 1173] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 1365] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 1] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 193] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 385] - 
conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 577] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 4] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 196] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 388] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 580] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 7] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 199] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 391] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 583] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 10] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 202] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 394] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 586] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 13] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner 
* 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 205] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 397] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 589] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 16] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 208] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 400] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 592] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 19] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 211] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 403] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 595] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 22] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 214] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 406] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * 
kernel_shared[rc_outer_inner * 24 + 598] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 769] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 961] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 1153] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 1345] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 772] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 964] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 1156] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 1348] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 775] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 967] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 1159] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 1351] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 778] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 970] - conv2d_nchw[6] = 
conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 1162] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 1354] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 781] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 973] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 1165] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 1357] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 784] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 976] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 1168] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 1360] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 787] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 979] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 1171] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 1363] - conv2d_nchw[4] = conv2d_nchw[4] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 790] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 982] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 1174] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 1366] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 2] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 194] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 386] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 578] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 5] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 197] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 389] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 581] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 8] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 200] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * 
kernel_shared[rc_outer_inner * 24 + 392] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 584] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 11] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 203] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 395] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 587] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 14] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 206] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 398] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 590] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 17] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 209] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 401] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 593] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 20] - conv2d_nchw[1] 
= conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 212] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 404] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 596] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 23] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 215] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 407] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 599] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 770] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 962] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 1154] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 1346] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 773] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 965] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 1157] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + 
threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 1349] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 776] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 968] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 1160] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 1352] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 779] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 971] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 1163] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 1355] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 782] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 974] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 1166] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 1358] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 785] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * 
kernel_shared[rc_outer_inner * 24 + 977] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 1169] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 1361] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 788] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 980] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 1172] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 1364] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 791] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 983] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 1175] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 1367] - -0 - -8 - -i1_inner - -conv2d_nchw[i1_inner] - -8 - -blockIdx_x * 8 - -blockIdx_x * 8 + i1_inner - -bias[blockIdx_x * 8 + i1_inner] - -conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner] - -T.float32(0.0) - -T.max(conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) - -392 - -blockIdx_x * 392 - -49 - -i1_inner * 49 - -blockIdx_x * 392 + i1_inner * 49 - -blockIdx_x * 392 + i1_inner * 49 + threadIdx_x - -compute = T.Buffer((25088,)) -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -i1_inner = T.int32() -bias = T.Buffer((512,)) 
-blockIdx_x = T.int32() -threadIdx_x = T.int32() -compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) - -for i1_inner in range(8): - compute = T.Buffer((25088,)) - conv2d_nchw = T.Buffer((8,), scope="local", align=32) - bias = T.Buffer((512,)) - blockIdx_x = T.int32() - threadIdx_x = T.int32() - compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) - -conv2d_nchw = T.Buffer((8,), scope="local", align=32) -conv2d_nchw[0] = T.float32(0.0) -conv2d_nchw[1] = T.float32(0.0) -conv2d_nchw[2] = T.float32(0.0) -conv2d_nchw[3] = T.float32(0.0) -conv2d_nchw[4] = T.float32(0.0) -conv2d_nchw[5] = T.float32(0.0) -conv2d_nchw[6] = T.float32(0.0) -conv2d_nchw[7] = T.float32(0.0) -blockIdx_x = T.int32() -threadIdx_x_2 = T.int32() -for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - threadIdx_x = T.env_thread("threadIdx.x") - pad_temp_shared = T.Buffer((4032,), scope="shared") - data = T.Buffer((25088,)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 49] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 49) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 98] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer 
* 3136 + (threadIdx_x + 98) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 147] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 147) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 196] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 196) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 245] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 245) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 294] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 294) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 343] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 343) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], 
T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 392] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 392) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 441] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 335], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 490] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 490) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 539] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 539) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 588] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 588) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 637] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + 
(threadIdx_x + 637) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 686] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 686) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 735] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 735) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 784] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 784) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 833] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 833) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 882] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 678], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 931] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and 
(threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 931) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 980] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 980) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1029] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1029) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1078] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1078) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1127] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1127) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1176] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, 
data[rc_outer_outer * 3136 + (threadIdx_x + 1176) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1225] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1225) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1274] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1274) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1323] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1021], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1372] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1372) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1421] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1421) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - 
pad_temp_shared[threadIdx_x + 1470] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1470) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1519] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1519) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1568] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1568) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1617] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1617) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1666] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1666) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1715] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and 
rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1715) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1764] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1364], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1813] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1813) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1862] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1862) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1911] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1911) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 1960] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 1960) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2009] = 
T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2009) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2058] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2058) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2107] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2107) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2156] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2156) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2205] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 1707], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2254] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 
2254) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2303] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2303) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2352] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2352) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2401] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2401) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2450] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2450) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2499] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2499) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with 
T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2548] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2548) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2597] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2597) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2646] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2050], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2695] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2695) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2744] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2744) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2793] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and 
rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2793) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2842] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2842) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2891] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2891) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2940] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2940) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 2989] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 2989) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3038] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3038) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - with 
T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3087] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2393], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3136] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3136) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3185] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3185) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3234] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3234) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3283] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3283) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3332] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 
< 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3332) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3381] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3381) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3430] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3430) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3479] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3479) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3528] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 2736], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3577] = T.if_then_else(1 <= (threadIdx_x // 7 + 7) % 9 and (threadIdx_x // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3577) // 63 * 49 + (threadIdx_x // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - 
pad_temp_shared[threadIdx_x + 3626] = T.if_then_else(1 <= (threadIdx_x // 7 + 5) % 9 and (threadIdx_x // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3626) // 63 * 49 + (threadIdx_x // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3675] = T.if_then_else(1 <= (threadIdx_x // 7 + 3) % 9 and (threadIdx_x // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3675) // 63 * 49 + (threadIdx_x // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3724] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3724) // 63 * 49 + threadIdx_x + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3773] = T.if_then_else(1 <= (threadIdx_x // 7 + 8) % 9 and (threadIdx_x // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3773) // 63 * 49 + (threadIdx_x // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3822] = T.if_then_else(1 <= (threadIdx_x // 7 + 6) % 9 and (threadIdx_x // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3822) // 63 * 49 + (threadIdx_x // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3871] = T.if_then_else(1 <= (threadIdx_x // 7 + 4) % 9 and (threadIdx_x // 7 + 4) % 9 < 
8 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3871) // 63 * 49 + (threadIdx_x // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3920] = T.if_then_else(threadIdx_x < 42 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 3920) // 63 * 49 + threadIdx_x + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - pad_temp_shared[threadIdx_x + 3969] = T.if_then_else(7 <= threadIdx_x and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x + rx_outer_outer + 3079], T.float32(0.0)) - with T.launch_thread(threadIdx_x, 49): - if threadIdx_x < 14: - pad_temp_shared[threadIdx_x + 4018] = T.if_then_else(threadIdx_x < 7 and 1 <= rx_outer_outer + threadIdx_x % 7 and rx_outer_outer + threadIdx_x % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x + 41], T.float32(0.0)) - threadIdx_x_1 = T.env_thread("threadIdx.x") - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 49] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 147] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 98] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 294] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 147] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 147) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 147) % 
192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 196] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 196) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 12] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 245] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 245) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 159] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 294] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 294) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 306] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 343] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 343) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 151) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 392] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 392) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 24] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 441] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 441) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 171] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 490] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 490) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 318] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 539] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 539) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 155) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 588] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 588) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 36] - with T.launch_thread(threadIdx_x_1, 49): 
- kernel_shared[threadIdx_x_1 + 637] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 637) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 183] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 686] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 686) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 330] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 735] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 735) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 159) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 784] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 784) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 48] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 833] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 833) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 195] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 882] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 882) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 342] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 931] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 931) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 163) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 980] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 980) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 60] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1029] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1029) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 207] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1078] = kernel[blockIdx_x * 36864 
+ (threadIdx_x_1 + 1078) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 354] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1127] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1127) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 167) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1176] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1176) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 72] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1225] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1225) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 219] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1274] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1274) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 366] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1323] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1323) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_1 + 171) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1372] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1372) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 84] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1421] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1421) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 231] - with T.launch_thread(threadIdx_x_1, 49): - kernel_shared[threadIdx_x_1 + 1470] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1470) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 378] - with T.launch_thread(threadIdx_x_1, 49): - if threadIdx_x_1 < 17: - kernel_shared[threadIdx_x_1 + 1519] = kernel[blockIdx_x * 36864 + (threadIdx_x_1 + 1519) 
// 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_1 * 3 + rx_outer_outer + 525] - for rc_outer_inner in range(8): - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 192] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 384] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 576] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 3] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 195] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 387] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 579] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 6] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 198] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 390] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 582] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 9] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * 
kernel_shared[rc_outer_inner * 24 + 201] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 393] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 585] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 12] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 204] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 396] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 588] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 15] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 207] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 399] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 591] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 18] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 210] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 402] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 594] - 
conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 21] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 213] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 405] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 597] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 768] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 960] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 1152] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2] * kernel_shared[rc_outer_inner * 24 + 1344] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 771] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 963] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 1155] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 63] * kernel_shared[rc_outer_inner * 24 + 1347] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 774] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 966] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + 
threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 1158] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 126] * kernel_shared[rc_outer_inner * 24 + 1350] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 777] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 969] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 1161] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 189] * kernel_shared[rc_outer_inner * 24 + 1353] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 780] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 972] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 1164] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 252] * kernel_shared[rc_outer_inner * 24 + 1356] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 783] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 975] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 1167] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 315] * kernel_shared[rc_outer_inner * 24 + 1359] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * 
kernel_shared[rc_outer_inner * 24 + 786] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 978] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 1170] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 378] * kernel_shared[rc_outer_inner * 24 + 1362] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 789] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 981] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 1173] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 441] * kernel_shared[rc_outer_inner * 24 + 1365] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 1] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 193] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 385] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 577] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 4] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 196] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 388] - conv2d_nchw[3] = 
conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 580] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 7] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 199] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 391] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 583] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 10] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 202] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 394] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 586] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 13] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 205] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 397] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 589] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 16] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + 
threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 208] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 400] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 592] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 19] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 211] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 403] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 595] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 22] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 214] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 406] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 598] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 769] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 961] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 1153] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 7] * kernel_shared[rc_outer_inner * 24 + 
1345] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 772] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 964] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 1156] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 70] * kernel_shared[rc_outer_inner * 24 + 1348] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 775] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 967] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 1159] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 133] * kernel_shared[rc_outer_inner * 24 + 1351] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 778] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 970] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 1162] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 196] * kernel_shared[rc_outer_inner * 24 + 1354] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 781] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 973] - conv2d_nchw[6] = conv2d_nchw[6] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 1165] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 259] * kernel_shared[rc_outer_inner * 24 + 1357] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 784] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 976] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 1168] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 322] * kernel_shared[rc_outer_inner * 24 + 1360] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 787] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 979] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 1171] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 385] * kernel_shared[rc_outer_inner * 24 + 1363] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 790] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 982] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 1174] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 448] * kernel_shared[rc_outer_inner * 24 + 1366] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + 
threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 2] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 194] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 386] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 578] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 5] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 197] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 389] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 581] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 8] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 200] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 392] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 584] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 11] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 203] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 395] - 
conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 587] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 14] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 206] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 398] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 590] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 17] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 209] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 401] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 593] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 20] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 212] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 404] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 596] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 23] - conv2d_nchw[1] = conv2d_nchw[1] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 215] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 407] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 599] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 770] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 962] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 1154] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 14] * kernel_shared[rc_outer_inner * 24 + 1346] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 773] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 965] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 1157] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 77] * kernel_shared[rc_outer_inner * 24 + 1349] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 776] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 968] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 140] * kernel_shared[rc_outer_inner * 24 + 1160] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 
140] * kernel_shared[rc_outer_inner * 24 + 1352] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 779] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 971] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 1163] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 203] * kernel_shared[rc_outer_inner * 24 + 1355] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 782] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 974] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 1166] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 266] * kernel_shared[rc_outer_inner * 24 + 1358] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 785] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 977] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 1169] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 329] * kernel_shared[rc_outer_inner * 24 + 1361] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 788] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 
980] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 1172] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 392] * kernel_shared[rc_outer_inner * 24 + 1364] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 791] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 983] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 1175] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x_2 + 455] * kernel_shared[rc_outer_inner * 24 + 1367] -for i1_inner in range(8): - compute = T.Buffer((25088,)) - bias = T.Buffer((512,)) - compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x_2] = T.max(conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) - -with T.launch_thread("threadIdx.x", 49) as threadIdx_x: - conv2d_nchw = T.Buffer((8,), scope="local", align=32) - conv2d_nchw[0] = T.float32(0.0) - conv2d_nchw[1] = T.float32(0.0) - conv2d_nchw[2] = T.float32(0.0) - conv2d_nchw[3] = T.float32(0.0) - conv2d_nchw[4] = T.float32(0.0) - conv2d_nchw[5] = T.float32(0.0) - conv2d_nchw[6] = T.float32(0.0) - conv2d_nchw[7] = T.float32(0.0) - blockIdx_x = T.int32() - for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - threadIdx_x_1 = T.env_thread("threadIdx.x") - pad_temp_shared = T.Buffer((4032,), scope="shared") - data = T.Buffer((25088,)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - 
pad_temp_shared[threadIdx_x_1 + 49] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 49) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 98] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 98) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 147] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 147) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 196] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 196) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 245] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 245) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 294] = 
T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 294) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 343] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 343) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 392] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 392) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 441] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 335], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 490] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 490) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 539] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and 
rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 539) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 588] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 588) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 637] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 637) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 686] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 686) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 735] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 735) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 784] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, 
data[rc_outer_outer * 3136 + (threadIdx_x_1 + 784) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 833] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 833) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 882] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 678], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 931] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 931) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 980] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 980) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1029] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1029) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], 
T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1078] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1078) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1127] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1127) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1176] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1176) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1225] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1225) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1274] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1274) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1323] = T.if_then_else(7 <= 
threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 1021], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1372] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1372) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1421] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1421) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1470] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1470) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1519] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1519) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1568] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, 
data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1568) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1617] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1617) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1666] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1666) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1715] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1715) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1764] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 1364], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1813] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1813) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 
- 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1862] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1862) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1911] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1911) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1960] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1960) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2009] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2009) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2058] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2058) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with 
T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2107] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2107) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2156] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2156) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2205] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 1707], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2254] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2254) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2303] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2303) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2352] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and 
(threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2352) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2401] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2401) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2450] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2450) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2499] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2499) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2548] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2548) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2597] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and 
rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2597) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2646] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 2050], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2695] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2695) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2744] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2744) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2793] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2793) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2842] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2842) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) 
- with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2891] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2891) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2940] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2940) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2989] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2989) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3038] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3038) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3087] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 2393], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3136] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 
and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3136) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3185] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3185) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3234] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3234) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3283] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3283) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3332] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3332) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3381] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 
< 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3381) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3430] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3430) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3479] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3479) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3528] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 2736], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3577] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3577) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3626] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + 
(threadIdx_x_1 + 3626) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3675] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3675) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3724] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3724) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3773] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3773) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3822] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3822) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3871] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3871) // 63 * 49 
+ (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3920] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3920) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3969] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 3079], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if threadIdx_x_1 < 14: - pad_temp_shared[threadIdx_x_1 + 4018] = T.if_then_else(threadIdx_x_1 < 7 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x_1 + 41], T.float32(0.0)) - threadIdx_x_2 = T.env_thread("threadIdx.x") - kernel_shared = T.Buffer((1536,), scope="shared") - kernel = T.Buffer((2359296,)) - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared[threadIdx_x_2] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared[threadIdx_x_2 + 49] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 147] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared[threadIdx_x_2 + 98] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 294] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared[threadIdx_x_2 + 147] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 147) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 147) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared[threadIdx_x_2 + 
196] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 196) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 12] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared[threadIdx_x_2 + 245] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 245) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 159] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared[threadIdx_x_2 + 294] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 294) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 306] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared[threadIdx_x_2 + 343] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 343) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 151) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared[threadIdx_x_2 + 392] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 392) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 24] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared[threadIdx_x_2 + 441] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 441) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 171] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared[threadIdx_x_2 + 490] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 490) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 318] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared[threadIdx_x_2 + 539] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 539) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 155) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared[threadIdx_x_2 + 588] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 588) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 36] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared[threadIdx_x_2 + 637] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 637) // 192 * 
4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 183] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared[threadIdx_x_2 + 686] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 686) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 330] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared[threadIdx_x_2 + 735] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 735) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 159) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared[threadIdx_x_2 + 784] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 784) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 48] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared[threadIdx_x_2 + 833] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 833) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 195] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared[threadIdx_x_2 + 882] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 882) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 342] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared[threadIdx_x_2 + 931] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 931) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 163) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared[threadIdx_x_2 + 980] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 980) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 60] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared[threadIdx_x_2 + 1029] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1029) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 207] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared[threadIdx_x_2 + 1078] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1078) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer 
+ 354] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared[threadIdx_x_2 + 1127] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1127) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 167) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared[threadIdx_x_2 + 1176] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1176) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 72] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared[threadIdx_x_2 + 1225] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1225) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 219] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared[threadIdx_x_2 + 1274] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1274) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 366] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared[threadIdx_x_2 + 1323] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1323) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 171) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared[threadIdx_x_2 + 1372] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1372) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 84] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared[threadIdx_x_2 + 1421] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1421) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 231] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared[threadIdx_x_2 + 1470] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1470) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 378] - with T.launch_thread(threadIdx_x_2, 49): - if threadIdx_x_2 < 17: - kernel_shared[threadIdx_x_2 + 1519] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1519) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 525] - for 
rc_outer_inner in range(8): - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 192] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 384] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 576] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 3] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 195] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 387] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 579] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 6] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 198] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 390] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 582] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 9] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 201] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] 
* kernel_shared[rc_outer_inner * 24 + 393] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 585] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 12] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 204] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 396] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 588] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 15] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 207] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 399] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 591] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 18] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 210] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 402] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 594] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 21] - conv2d_nchw[1] = conv2d_nchw[1] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 213] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 405] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 597] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 768] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 960] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 1152] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared[rc_outer_inner * 24 + 1344] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 771] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 963] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 1155] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared[rc_outer_inner * 24 + 1347] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 774] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 966] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 1158] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared[rc_outer_inner * 24 + 1350] - 
conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 777] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 969] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 1161] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared[rc_outer_inner * 24 + 1353] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 780] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 972] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 1164] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared[rc_outer_inner * 24 + 1356] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 783] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 975] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 1167] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared[rc_outer_inner * 24 + 1359] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 786] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 978] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + 
threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 1170] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared[rc_outer_inner * 24 + 1362] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 789] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 981] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 1173] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared[rc_outer_inner * 24 + 1365] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 193] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 385] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 577] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 4] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 196] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 388] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 580] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 7] - conv2d_nchw[1] = conv2d_nchw[1] 
+ pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 199] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 391] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 583] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 10] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 202] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 394] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 586] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 13] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 205] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 397] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 589] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 16] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 208] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 400] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * 
kernel_shared[rc_outer_inner * 24 + 592] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 19] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 211] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 403] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 595] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 22] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 214] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 406] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 598] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 769] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 961] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1153] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared[rc_outer_inner * 24 + 1345] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 772] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 964] - conv2d_nchw[6] = conv2d_nchw[6] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 1156] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared[rc_outer_inner * 24 + 1348] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 775] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 967] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 1159] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared[rc_outer_inner * 24 + 1351] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 778] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 970] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 1162] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared[rc_outer_inner * 24 + 1354] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 781] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 973] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 1165] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared[rc_outer_inner * 24 + 1357] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * 
kernel_shared[rc_outer_inner * 24 + 784] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 976] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 1168] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared[rc_outer_inner * 24 + 1360] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 787] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 979] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 1171] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared[rc_outer_inner * 24 + 1363] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 790] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 982] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 1174] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared[rc_outer_inner * 24 + 1366] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 2] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 194] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 386] - conv2d_nchw[3] = conv2d_nchw[3] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 578] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 5] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 197] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 389] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 581] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 8] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 200] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 392] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 584] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 11] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 203] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 395] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 587] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 14] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 
24 + 206] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 398] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 590] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 17] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 209] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 401] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 593] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 20] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 212] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 404] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 596] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 23] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 215] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 407] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 599] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + 
threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 770] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 962] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 1154] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared[rc_outer_inner * 24 + 1346] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 773] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 965] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 1157] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared[rc_outer_inner * 24 + 1349] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 776] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 968] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 1160] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared[rc_outer_inner * 24 + 1352] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 779] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 971] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 1163] - conv2d_nchw[7] = 
conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared[rc_outer_inner * 24 + 1355] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 782] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 974] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 1166] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared[rc_outer_inner * 24 + 1358] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 785] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 977] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 1169] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared[rc_outer_inner * 24 + 1361] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 788] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 980] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 1172] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared[rc_outer_inner * 24 + 1364] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 791] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] 
* kernel_shared[rc_outer_inner * 24 + 983] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 1175] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared[rc_outer_inner * 24 + 1367] - for i1_inner in range(8): - compute = T.Buffer((25088,)) - bias = T.Buffer((512,)) - compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) - -T.bool(True) - -with T.allocate([1536], "float32", "shared") as kernel_shared: - threadIdx_x = T.launch_thread("threadIdx.x", 49) - conv2d_nchw = T.Buffer((8,), scope="local", align=32) - conv2d_nchw[0] = T.float32(0.0) - conv2d_nchw[1] = T.float32(0.0) - conv2d_nchw[2] = T.float32(0.0) - conv2d_nchw[3] = T.float32(0.0) - conv2d_nchw[4] = T.float32(0.0) - conv2d_nchw[5] = T.float32(0.0) - conv2d_nchw[6] = T.float32(0.0) - conv2d_nchw[7] = T.float32(0.0) - blockIdx_x = T.int32() - for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - threadIdx_x_1 = T.env_thread("threadIdx.x") - pad_temp_shared = T.Buffer((4032,), scope="shared") - data = T.Buffer((25088,)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 49] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 49) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 98] = 
T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 98) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 147] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 147) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 196] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 196) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 245] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 245) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 294] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 294) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 343] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) 
% 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 343) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 392] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 392) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 441] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 335], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 490] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 490) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 539] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 539) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 588] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, 
data[rc_outer_outer * 3136 + (threadIdx_x_1 + 588) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 637] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 637) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 686] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 686) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 735] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 735) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 784] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 784) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 833] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 833) // 63 * 49 + threadIdx_x_1 + rx_outer_outer 
+ 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 882] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 678], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 931] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 931) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 980] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 980) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1029] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1029) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1078] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1078) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1127] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and 
(threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1127) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1176] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1176) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1225] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1225) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1274] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1274) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1323] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 1021], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1372] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, 
data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1372) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1421] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1421) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1470] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1470) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1519] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1519) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1568] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1568) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1617] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + 
(threadIdx_x_1 + 1617) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1666] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1666) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1715] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1715) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1764] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 1364], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1813] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1813) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1862] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1862) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with 
T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1911] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1911) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 1960] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1960) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2009] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2009) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2058] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2058) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2107] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2107) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 
49): - pad_temp_shared[threadIdx_x_1 + 2156] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2156) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2205] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 1707], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2254] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2254) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2303] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2303) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2352] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2352) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2401] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + 
threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2401) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2450] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2450) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2499] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2499) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2548] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2548) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2597] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2597) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2646] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 2050], 
T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2695] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2695) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2744] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2744) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2793] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2793) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2842] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2842) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2891] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2891) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with 
T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2940] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2940) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 2989] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2989) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3038] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3038) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3087] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 2393], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3136] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3136) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3185] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and 
(threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3185) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3234] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3234) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3283] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3283) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3332] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3332) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3381] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3381) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3430] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 
and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3430) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3479] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3479) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3528] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 2736], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3577] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3577) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3626] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3626) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3675] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 
3675) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3724] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3724) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3773] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3773) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3822] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3822) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3871] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3871) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3920] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3920) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with 
T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared[threadIdx_x_1 + 3969] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 3079], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if threadIdx_x_1 < 14: - pad_temp_shared[threadIdx_x_1 + 4018] = T.if_then_else(threadIdx_x_1 < 7 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x_1 + 41], T.float32(0.0)) - threadIdx_x_2 = T.env_thread("threadIdx.x") - kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") - kernel = T.Buffer((2359296,)) - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 49] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 147] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 98] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 294] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 147] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 147) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 147) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 196] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 196) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 12] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 245] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 245) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 159] - with T.launch_thread(threadIdx_x_2, 49): 
- kernel_shared_1[threadIdx_x_2 + 294] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 294) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 306] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 343] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 343) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 151) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 392] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 392) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 24] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 441] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 441) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 171] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 490] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 490) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 318] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 539] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 539) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 155) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 588] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 588) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 36] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 637] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 637) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 183] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 686] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 686) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 330] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 735] = 
kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 735) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 159) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 784] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 784) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 48] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 833] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 833) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 195] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 882] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 882) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 342] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 931] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 931) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 163) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 980] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 980) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 60] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1029] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1029) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 207] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1078] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1078) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 354] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1127] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1127) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 167) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1176] = kernel[blockIdx_x * 36864 + 
(threadIdx_x_2 + 1176) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 72] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1225] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1225) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 219] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1274] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1274) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 366] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1323] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1323) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 171) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1372] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1372) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 84] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1421] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1421) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 231] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1470] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1470) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 378] - with T.launch_thread(threadIdx_x_2, 49): - if threadIdx_x_2 < 17: - kernel_shared_1[threadIdx_x_2 + 1519] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1519) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 525] - for rc_outer_inner in range(8): - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 192] - conv2d_nchw[2] = conv2d_nchw[2] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 384] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 576] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 3] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 195] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 387] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 579] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 6] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 198] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 390] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 582] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 9] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 201] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 393] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 585] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * 
kernel_shared_1[rc_outer_inner * 24 + 12] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 204] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 396] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 588] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 15] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 207] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 399] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 591] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 18] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 210] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 402] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 594] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 21] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 213] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 405] - 
conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 597] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 768] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 960] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 1152] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 1344] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 771] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 963] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 1155] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 1347] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 774] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 966] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 1158] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 1350] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 777] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + 
threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 969] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 1161] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 1353] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 780] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 972] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 1164] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 1356] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 783] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 975] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 1167] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 1359] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 786] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 978] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 1170] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 378] * 
kernel_shared_1[rc_outer_inner * 24 + 1362] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 789] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 981] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 1173] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 1365] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 1] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 193] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 385] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 577] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 4] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 196] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 388] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 580] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 7] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 199] - conv2d_nchw[2] = 
conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 391] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 583] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 10] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 202] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 394] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 586] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 13] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 205] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 397] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 589] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 16] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 208] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 400] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 592] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + 
threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 19] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 211] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 403] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 595] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 22] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 214] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 406] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 598] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 769] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 961] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 1153] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 1345] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 772] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 964] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 1156] 
- conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 1348] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 775] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 967] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 1159] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 1351] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 778] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 970] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 1162] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 1354] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 781] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 973] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 1165] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 1357] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 784] - conv2d_nchw[5] = conv2d_nchw[5] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 976] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 1168] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 1360] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 787] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 979] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 1171] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 1363] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 790] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 982] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 1174] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 1366] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 2] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 194] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 386] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 
14] * kernel_shared_1[rc_outer_inner * 24 + 578] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 5] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 197] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 389] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 581] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 8] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 200] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 392] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 584] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 11] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 203] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 395] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 587] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 14] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 206] - 
conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 398] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 590] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 17] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 209] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 401] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 593] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 20] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 212] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 404] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 596] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 23] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 215] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 407] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 599] - conv2d_nchw[4] = conv2d_nchw[4] + 
pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 770] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 962] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 1154] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 1346] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 773] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 965] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 1157] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 1349] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 776] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 968] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 1160] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 1352] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 779] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 971] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 
203] * kernel_shared_1[rc_outer_inner * 24 + 1163] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 1355] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 782] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 974] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 1166] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 1358] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 785] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 977] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 1169] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 1361] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 788] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 980] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 1172] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 1364] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 
791] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 983] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 1175] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 1367] - for i1_inner in range(8): - compute = T.Buffer((25088,)) - bias = T.Buffer((512,)) - compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) - -T.bool(True) - -with T.allocate([4032], "float32", "shared") as pad_temp_shared: - kernel_shared = T.allocate([1536], "float32", "shared") - threadIdx_x = T.launch_thread("threadIdx.x", 49) - conv2d_nchw = T.Buffer((8,), scope="local", align=32) - conv2d_nchw[0] = T.float32(0.0) - conv2d_nchw[1] = T.float32(0.0) - conv2d_nchw[2] = T.float32(0.0) - conv2d_nchw[3] = T.float32(0.0) - conv2d_nchw[4] = T.float32(0.0) - conv2d_nchw[5] = T.float32(0.0) - conv2d_nchw[6] = T.float32(0.0) - conv2d_nchw[7] = T.float32(0.0) - blockIdx_x = T.int32() - for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - threadIdx_x_1 = T.env_thread("threadIdx.x") - pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") - data = T.Buffer((25088,)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 49] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 49) // 63 
* 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 98] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 98) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 147] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 147) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 196] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 196) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 245] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 245) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 294] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 294) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 
* 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 343] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 343) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 392] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 392) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 441] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 335], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 490] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 490) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 539] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 539) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - 
pad_temp_shared_1[threadIdx_x_1 + 588] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 588) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 637] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 637) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 686] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 686) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 735] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 735) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 784] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 784) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 
833] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 833) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 882] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 678], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 931] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 931) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 980] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 980) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1029] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1029) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1078] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 
3136 + (threadIdx_x_1 + 1078) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1127] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1127) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1176] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1176) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1225] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1225) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1274] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1274) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1323] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 1021], T.float32(0.0)) - with 
T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1372] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1372) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1421] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1421) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1470] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1470) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1519] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1519) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1568] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1568) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with 
T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1617] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1617) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1666] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1666) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1715] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1715) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1764] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 1364], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1813] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1813) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1862] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) 
% 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1862) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1911] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1911) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1960] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1960) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2009] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2009) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2058] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2058) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2107] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 
// 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2107) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2156] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2156) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2205] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 1707], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2254] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2254) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2303] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2303) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2352] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, 
data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2352) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2401] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2401) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2450] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2450) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2499] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2499) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2548] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2548) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2597] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2597) // 63 * 49 + 
threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2646] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 2050], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2695] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2695) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2744] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2744) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2793] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2793) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2842] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2842) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2891] = 
T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2891) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2940] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2940) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2989] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2989) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3038] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3038) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3087] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 2393], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3136] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + 
threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3136) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3185] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3185) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3234] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3234) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3283] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3283) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3332] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3332) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3381] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and 
rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3381) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3430] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3430) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3479] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3479) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3528] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 2736], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3577] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3577) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3626] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3626) // 63 * 49 + (threadIdx_x_1 // 7 
+ 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3675] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3675) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3724] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3724) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3773] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3773) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3822] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3822) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3871] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3871) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + 
rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3920] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3920) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3969] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 3079], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if threadIdx_x_1 < 14: - pad_temp_shared_1[threadIdx_x_1 + 4018] = T.if_then_else(threadIdx_x_1 < 7 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x_1 + 41], T.float32(0.0)) - threadIdx_x_2 = T.env_thread("threadIdx.x") - kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") - kernel = T.Buffer((2359296,)) - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 49] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 147] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 98] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 294] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 147] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 147) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 147) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 
196] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 196) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 12] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 245] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 245) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 159] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 294] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 294) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 306] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 343] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 343) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 151) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 392] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 392) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 24] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 441] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 441) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 171] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 490] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 490) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 318] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 539] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 539) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 155) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 588] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 588) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 36] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 637] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 
637) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 183] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 686] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 686) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 330] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 735] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 735) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 159) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 784] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 784) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 48] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 833] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 833) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 195] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 882] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 882) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 342] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 931] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 931) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 163) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 980] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 980) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 60] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1029] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1029) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 207] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1078] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1078) // 192 * 4608 + rc_outer_outer * 576 + 
threadIdx_x_2 * 3 + rx_outer_outer + 354] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1127] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1127) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 167) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1176] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1176) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 72] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1225] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1225) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 219] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1274] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1274) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 366] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1323] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1323) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 171) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1372] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1372) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 84] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1421] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1421) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 231] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1470] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1470) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 378] - with T.launch_thread(threadIdx_x_2, 49): - if threadIdx_x_2 < 17: - kernel_shared_1[threadIdx_x_2 + 1519] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1519) // 192 * 4608 + rc_outer_outer * 576 + 
threadIdx_x_2 * 3 + rx_outer_outer + 525] - for rc_outer_inner in range(8): - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 192] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 384] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 576] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 3] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 195] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 387] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 579] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 6] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 198] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 390] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 582] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 9] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 
24 + 201] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 393] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 585] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 12] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 204] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 396] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 588] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 15] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 207] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 399] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 591] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 18] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 210] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 402] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 594] - conv2d_nchw[0] = 
conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 21] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 213] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 405] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 597] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 768] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 960] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 1152] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 1344] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 771] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 963] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 1155] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 1347] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 774] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 966] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner 
* 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 1158] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 1350] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 777] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 969] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 1161] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 1353] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 780] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 972] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 1164] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 1356] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 783] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 975] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 1167] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 1359] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + 
threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 786] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 978] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 1170] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 1362] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 789] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 981] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 1173] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 1365] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 1] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 193] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 385] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 577] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 4] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 196] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * 
kernel_shared_1[rc_outer_inner * 24 + 388] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 580] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 7] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 199] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 391] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 583] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 10] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 202] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 394] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 586] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 13] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 205] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 397] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 589] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner 
* 24 + 16] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 208] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 400] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 592] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 19] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 211] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 403] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 595] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 22] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 214] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 406] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 598] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 769] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 961] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 1153] - conv2d_nchw[7] = 
conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 1345] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 772] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 964] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 1156] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 1348] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 775] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 967] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 1159] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 1351] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 778] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 970] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 1162] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 1354] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 781] - conv2d_nchw[5] = conv2d_nchw[5] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 973] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 1165] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 1357] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 784] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 976] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 1168] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 1360] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 787] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 979] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 1171] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 1363] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 790] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 982] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 1174] - conv2d_nchw[7] = conv2d_nchw[7] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 1366] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 2] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 194] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 386] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 578] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 5] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 197] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 389] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 581] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 8] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 200] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 392] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 584] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 11] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 
+ threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 203] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 395] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 587] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 14] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 206] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 398] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 590] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 17] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 209] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 401] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 593] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 20] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 212] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 404] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * 
kernel_shared_1[rc_outer_inner * 24 + 596] - conv2d_nchw[0] = conv2d_nchw[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 23] - conv2d_nchw[1] = conv2d_nchw[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 215] - conv2d_nchw[2] = conv2d_nchw[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 407] - conv2d_nchw[3] = conv2d_nchw[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 599] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 770] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 962] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 1154] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 1346] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 773] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 965] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 1157] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 1349] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 776] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner 
* 24 + 968] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 1160] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 1352] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 779] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 971] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 1163] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 1355] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 782] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 974] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 1166] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 1358] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 785] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 977] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 1169] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 1361] - 
conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 788] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 980] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 1172] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 1364] - conv2d_nchw[4] = conv2d_nchw[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 791] - conv2d_nchw[5] = conv2d_nchw[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 983] - conv2d_nchw[6] = conv2d_nchw[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 1175] - conv2d_nchw[7] = conv2d_nchw[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 1367] - for i1_inner in range(8): - compute = T.Buffer((25088,)) - bias = T.Buffer((512,)) - compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) - -T.bool(True) - -with T.allocate([8], "float32", "local") as conv2d_nchw: - pad_temp_shared = T.allocate([4032], "float32", "shared") - kernel_shared = T.allocate([1536], "float32", "shared") - threadIdx_x = T.launch_thread("threadIdx.x", 49) - conv2d_nchw_1 = T.Buffer((8,), data=conv2d_nchw, scope="local", align=32) - conv2d_nchw_1[0] = T.float32(0.0) - conv2d_nchw_1[1] = T.float32(0.0) - conv2d_nchw_1[2] = T.float32(0.0) - conv2d_nchw_1[3] = T.float32(0.0) - conv2d_nchw_1[4] = T.float32(0.0) - conv2d_nchw_1[5] = T.float32(0.0) - conv2d_nchw_1[6] = T.float32(0.0) - conv2d_nchw_1[7] = T.float32(0.0) - blockIdx_x = T.int32() - for 
rc_outer_outer, rx_outer_outer in T.grid(8, 3): - threadIdx_x_1 = T.env_thread("threadIdx.x") - pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") - data = T.Buffer((25088,)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 49] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 49) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 98] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 98) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 147] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 147) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 196] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 196) // 63 * 49 + threadIdx_x_1 + 
rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 245] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 245) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 294] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 294) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 343] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 343) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 392] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 392) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 441] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 335], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 490] = 
T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 490) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 539] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 539) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 588] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 588) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 637] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 637) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 686] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 686) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 735] = T.if_then_else(1 <= 
(threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 735) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 784] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 784) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 833] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 833) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 882] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 678], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 931] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 931) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 980] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + 
threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 980) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1029] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1029) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1078] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1078) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1127] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1127) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1176] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1176) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1225] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, 
data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1225) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1274] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1274) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1323] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 1021], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1372] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1372) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1421] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1421) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1470] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1470) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + 
threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1519] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1519) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1568] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1568) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1617] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1617) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1666] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1666) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1715] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1715) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - 
pad_temp_shared_1[threadIdx_x_1 + 1764] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 1364], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1813] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1813) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1862] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1862) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1911] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1911) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1960] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1960) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2009] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= 
rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2009) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2058] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2058) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2107] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2107) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2156] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2156) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2205] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 1707], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2254] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 
2254) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2303] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2303) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2352] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2352) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2401] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2401) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2450] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2450) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2499] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2499) // 63 * 49 + 
(threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2548] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2548) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2597] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2597) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2646] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 2050], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2695] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2695) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2744] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2744) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with 
T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2793] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2793) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2842] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2842) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2891] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2891) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2940] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2940) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2989] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2989) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with 
T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3038] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3038) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3087] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 2393], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3136] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3136) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3185] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3185) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3234] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3234) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3283] = T.if_then_else(1 <= rx_outer_outer + 
threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3283) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3332] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3332) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3381] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3381) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3430] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3430) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3479] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3479) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3528] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + 
threadIdx_x_1 + rx_outer_outer + 2736], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3577] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3577) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3626] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3626) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3675] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3675) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3724] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3724) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3773] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3773) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + 
threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3822] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3822) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3871] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3871) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3920] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3920) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3969] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 3079], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if threadIdx_x_1 < 14: - pad_temp_shared_1[threadIdx_x_1 + 4018] = T.if_then_else(threadIdx_x_1 < 7 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x_1 + 41], T.float32(0.0)) - threadIdx_x_2 = T.env_thread("threadIdx.x") - kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") - kernel = 
T.Buffer((2359296,)) - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 49] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 147] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 98] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 294] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 147] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 147) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 147) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 196] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 196) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 12] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 245] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 245) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 159] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 294] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 294) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 306] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 343] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 343) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 151) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 392] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 392) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 24] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 441] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 441) // 192 * 4608 + 
rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 171] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 490] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 490) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 318] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 539] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 539) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 155) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 588] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 588) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 36] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 637] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 637) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 183] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 686] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 686) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 330] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 735] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 735) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 159) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 784] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 784) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 48] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 833] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 833) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 195] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 882] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 882) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + 
rx_outer_outer + 342] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 931] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 931) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 163) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 980] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 980) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 60] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1029] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1029) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 207] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1078] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1078) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 354] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1127] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1127) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 167) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1176] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1176) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 72] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1225] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1225) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 219] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1274] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1274) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 366] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1323] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1323) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 171) % 192 * 3 + rx_outer_outer] - with 
T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1372] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1372) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 84] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1421] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1421) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 231] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1470] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1470) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 378] - with T.launch_thread(threadIdx_x_2, 49): - if threadIdx_x_2 < 17: - kernel_shared_1[threadIdx_x_2 + 1519] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1519) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 525] - for rc_outer_inner in range(8): - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 192] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 384] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 576] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 3] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 195] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 387] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * 
kernel_shared_1[rc_outer_inner * 24 + 579] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 6] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 198] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 390] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 582] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 9] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 201] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 393] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 585] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 12] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 204] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 396] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 588] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 15] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 
504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 207] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 399] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 591] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 18] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 210] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 402] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 594] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 21] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 213] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 405] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 597] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 768] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 960] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 1152] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 1344] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 771] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 963] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 1155] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 1347] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 774] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 966] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 1158] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 1350] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 777] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 969] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 1161] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 1353] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 780] - 
conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 972] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 1164] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 1356] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 783] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 975] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 1167] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 1359] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 786] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 978] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 1170] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 1362] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 789] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 981] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * 
kernel_shared_1[rc_outer_inner * 24 + 1173] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 1365] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 1] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 193] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 385] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 577] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 4] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 196] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 388] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 580] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 7] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 199] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 391] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 583] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + 
threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 10] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 202] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 394] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 586] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 13] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 205] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 397] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 589] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 16] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 208] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 400] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 592] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 19] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 211] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 403] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 595] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 22] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 214] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 406] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 598] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 769] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 961] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 1153] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 1345] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 772] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 964] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 1156] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 1348] - conv2d_nchw_1[4] 
= conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 775] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 967] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 1159] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 1351] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 778] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 970] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 1162] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 1354] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 781] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 973] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 1165] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 1357] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 784] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * 
kernel_shared_1[rc_outer_inner * 24 + 976] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 1168] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 1360] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 787] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 979] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 1171] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 1363] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 790] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 982] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 1174] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 1366] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 2] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 194] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 386] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 578] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 5] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 197] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 389] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 581] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 8] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 200] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 392] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 584] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 11] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 203] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 395] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 587] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 14] - conv2d_nchw_1[1] = 
conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 206] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 398] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 590] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 17] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 209] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 401] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 593] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 20] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 212] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 404] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 596] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 23] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 215] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 
+ 407] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 599] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 770] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 962] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 1154] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 1346] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 773] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 965] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 1157] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 1349] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 776] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 968] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 1160] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 1352] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * 
kernel_shared_1[rc_outer_inner * 24 + 779] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 971] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 1163] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 1355] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 782] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 974] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 1166] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 1358] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 785] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 977] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 1169] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 1361] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 788] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 980] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 1172] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 1364] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 791] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 983] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 1175] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 1367] - for i1_inner in range(8): - compute = T.Buffer((25088,)) - bias = T.Buffer((512,)) - compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw_1[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) - -with T.launch_thread("blockIdx.x", 64) as blockIdx_x: - conv2d_nchw = T.allocate([8], "float32", "local") - pad_temp_shared = T.allocate([4032], "float32", "shared") - kernel_shared = T.allocate([1536], "float32", "shared") - threadIdx_x = T.launch_thread("threadIdx.x", 49) - conv2d_nchw_1 = T.Buffer((8,), data=conv2d_nchw, scope="local", align=32) - conv2d_nchw_1[0] = T.float32(0.0) - conv2d_nchw_1[1] = T.float32(0.0) - conv2d_nchw_1[2] = T.float32(0.0) - conv2d_nchw_1[3] = T.float32(0.0) - conv2d_nchw_1[4] = T.float32(0.0) - conv2d_nchw_1[5] = T.float32(0.0) - conv2d_nchw_1[6] = T.float32(0.0) - conv2d_nchw_1[7] = T.float32(0.0) - for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - threadIdx_x_1 = T.env_thread("threadIdx.x") - pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") - data = T.Buffer((25088,)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1] = 
T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 49] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 49) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 98] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 98) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 147] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 147) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 196] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 196) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 245] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + 
threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 245) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 294] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 294) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 343] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 343) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 392] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 392) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 441] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 335], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 490] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 490) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + 
rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 539] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 539) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 588] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 588) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 637] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 637) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 686] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 686) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 735] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 735) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 
7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 784] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 784) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 833] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 833) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 882] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 678], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 931] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 931) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 980] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 980) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1029] = T.if_then_else(1 <= 
(threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1029) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1078] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1078) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1127] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1127) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1176] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1176) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1225] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1225) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1274] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= 
rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1274) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1323] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 1021], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1372] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1372) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1421] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1421) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1470] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1470) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1519] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1519) // 63 * 49 + 
threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1568] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1568) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1617] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1617) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1666] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1666) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1715] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1715) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1764] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 1364], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - 
pad_temp_shared_1[threadIdx_x_1 + 1813] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1813) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1862] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1862) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1911] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1911) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1960] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 1960) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2009] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2009) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - 
pad_temp_shared_1[threadIdx_x_1 + 2058] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2058) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2107] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2107) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2156] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2156) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2205] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 1707], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2254] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2254) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2303] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 
< 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2303) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2352] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2352) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2401] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2401) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2450] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2450) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2499] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2499) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2548] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= 
rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2548) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2597] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2597) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2646] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 2050], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2695] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2695) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2744] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2744) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2793] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 
2793) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2842] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2842) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2891] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2891) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2940] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2940) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2989] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 2989) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3038] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3038) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - 
with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3087] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 2393], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3136] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3136) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3185] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3185) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3234] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3234) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3283] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3283) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3332] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and 
(threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3332) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3381] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3381) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3430] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3430) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3479] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3479) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3528] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 2736], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3577] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, 
data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3577) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3626] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3626) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3675] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3675) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3724] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3724) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3773] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3773) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3822] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 
3136 + (threadIdx_x_1 + 3822) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3871] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3871) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3920] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 3920) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3969] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + threadIdx_x_1 + rx_outer_outer + 3079], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if threadIdx_x_1 < 14: - pad_temp_shared_1[threadIdx_x_1 + 4018] = T.if_then_else(threadIdx_x_1 < 7 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data[rc_outer_outer * 3136 + (threadIdx_x_1 + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x_1 + 41], T.float32(0.0)) - threadIdx_x_2 = T.env_thread("threadIdx.x") - kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") - kernel = T.Buffer((2359296,)) - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 49] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + 
rx_outer_outer + 147] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 98] = kernel[blockIdx_x * 36864 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 294] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 147] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 147) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 147) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 196] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 196) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 12] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 245] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 245) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 159] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 294] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 294) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 306] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 343] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 343) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 151) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 392] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 392) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 24] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 441] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 441) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 171] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 490] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 490) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 318] - with T.launch_thread(threadIdx_x_2, 49): - 
kernel_shared_1[threadIdx_x_2 + 539] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 539) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 155) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 588] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 588) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 36] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 637] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 637) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 183] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 686] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 686) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 330] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 735] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 735) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 159) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 784] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 784) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 48] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 833] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 833) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 195] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 882] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 882) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 342] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 931] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 931) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 163) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 980] = 
kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 980) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 60] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1029] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1029) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 207] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1078] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1078) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 354] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1127] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1127) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 167) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1176] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1176) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 72] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1225] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1225) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 219] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1274] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1274) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 366] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1323] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1323) // 192 * 4608 + rc_outer_outer * 576 + (threadIdx_x_2 + 171) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1372] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1372) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 84] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1421] = kernel[blockIdx_x * 36864 + 
(threadIdx_x_2 + 1421) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 231] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1470] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1470) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 378] - with T.launch_thread(threadIdx_x_2, 49): - if threadIdx_x_2 < 17: - kernel_shared_1[threadIdx_x_2 + 1519] = kernel[blockIdx_x * 36864 + (threadIdx_x_2 + 1519) // 192 * 4608 + rc_outer_outer * 576 + threadIdx_x_2 * 3 + rx_outer_outer + 525] - for rc_outer_inner in range(8): - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 192] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 384] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 576] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 3] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 195] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 387] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 579] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 6] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 
24 + 198] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 390] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 582] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 9] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 201] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 393] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 585] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 12] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 204] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 396] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 588] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 15] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 207] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 399] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * 
kernel_shared_1[rc_outer_inner * 24 + 591] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 18] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 210] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 402] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 594] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 21] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 213] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 405] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 597] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 768] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 960] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 1152] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[rc_outer_inner * 24 + 1344] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 771] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 
63] * kernel_shared_1[rc_outer_inner * 24 + 963] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 1155] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[rc_outer_inner * 24 + 1347] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 774] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 966] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 1158] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[rc_outer_inner * 24 + 1350] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 777] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 969] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 1161] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[rc_outer_inner * 24 + 1353] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 780] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 972] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 1164] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[rc_outer_inner * 24 + 1356] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 783] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 975] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 1167] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[rc_outer_inner * 24 + 1359] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 786] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 978] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 1170] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[rc_outer_inner * 24 + 1362] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 789] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 981] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 1173] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[rc_outer_inner * 24 + 1365] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 1] - 
conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 193] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 385] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 577] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 4] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 196] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 388] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 580] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 7] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 199] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 391] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 583] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 10] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 202] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * 
kernel_shared_1[rc_outer_inner * 24 + 394] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 586] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 13] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 205] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 397] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 589] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 16] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 208] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 400] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 592] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 19] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 211] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 403] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 595] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 
504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 22] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 214] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 406] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 598] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 769] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 961] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 1153] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[rc_outer_inner * 24 + 1345] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 772] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 964] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 1156] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[rc_outer_inner * 24 + 1348] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 775] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 967] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 1159] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[rc_outer_inner * 24 + 1351] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 778] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 970] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 1162] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[rc_outer_inner * 24 + 1354] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 781] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 973] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 1165] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[rc_outer_inner * 24 + 1357] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 784] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 976] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 1168] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[rc_outer_inner * 24 + 1360] - 
conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 787] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 979] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 1171] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[rc_outer_inner * 24 + 1363] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 790] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 982] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 1174] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[rc_outer_inner * 24 + 1366] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 2] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 194] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 386] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 578] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 5] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * 
kernel_shared_1[rc_outer_inner * 24 + 197] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 389] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 581] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 8] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 200] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 392] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 584] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 11] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 203] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 395] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 587] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 14] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 206] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 398] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 
504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 590] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 17] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 209] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 401] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 593] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 20] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 212] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 404] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 596] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 23] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 215] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 407] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 599] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 770] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 962] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 1154] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[rc_outer_inner * 24 + 1346] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 773] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 965] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 1157] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[rc_outer_inner * 24 + 1349] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 776] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 968] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 1160] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[rc_outer_inner * 24 + 1352] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 779] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 971] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 1163] - 
conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[rc_outer_inner * 24 + 1355] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 782] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 974] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 1166] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[rc_outer_inner * 24 + 1358] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 785] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 977] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 1169] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[rc_outer_inner * 24 + 1361] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 788] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 980] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 1172] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[rc_outer_inner * 24 + 1364] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * 
kernel_shared_1[rc_outer_inner * 24 + 791] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 983] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 1175] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[rc_outer_inner * 24 + 1367] - for i1_inner in range(8): - compute = T.Buffer((25088,)) - bias = T.Buffer((512,)) - compute[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw_1[i1_inner] + bias[blockIdx_x * 8 + i1_inner], T.float32(0.0)) - -# from tvm.script import ir as I -# from tvm.script import tir as T - -@I.ir_module -class Module: - @T.prim_func - def main(data: T.Buffer((1, 512, 7, 7), "float32"), kernel: T.Buffer((512, 512, 3, 3), "float32"), bias: T.Buffer((1, 512, 1, 1), "float32"), compute: T.Buffer((1, 512, 7, 7), "float32")): - T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)}) - blockIdx_x = T.launch_thread("blockIdx.x", 64) - conv2d_nchw = T.allocate([8], "float32", "local") - pad_temp_shared = T.allocate([4032], "float32", "shared") - kernel_shared = T.allocate([1536], "float32", "shared") - threadIdx_x = T.launch_thread("threadIdx.x", 49) - conv2d_nchw_1 = T.Buffer((8,), data=conv2d_nchw, scope="local", align=32) - conv2d_nchw_1[0] = T.float32(0.0) - conv2d_nchw_1[1] = T.float32(0.0) - conv2d_nchw_1[2] = T.float32(0.0) - conv2d_nchw_1[3] = T.float32(0.0) - conv2d_nchw_1[4] = T.float32(0.0) - conv2d_nchw_1[5] = T.float32(0.0) - conv2d_nchw_1[6] = T.float32(0.0) - conv2d_nchw_1[7] = T.float32(0.0) - for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - cse_var_2: T.int32 = rc_outer_outer * 3136 - cse_var_1: T.int32 = rc_outer_outer * 576 - threadIdx_x_1 = T.env_thread("threadIdx.x") - pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, 
scope="shared") - data_1 = T.Buffer((25088,), data=data.data) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 49] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 49) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 98] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 98) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 147] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 147) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 196] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 196) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 245] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 
7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 245) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 294] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 294) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 343] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 343) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 392] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 392) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 441] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 335], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 490] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 490) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 
9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 539] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 539) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 588] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 588) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 637] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 637) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 686] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 686) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 735] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 735) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with 
T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 784] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 784) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 833] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 833) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 882] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 678], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 931] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 931) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 980] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 980) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1029] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= 
rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1029) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1078] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1078) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1127] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1127) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1176] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1176) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1225] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1225) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1274] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1274) // 
63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1323] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 1021], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1372] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1372) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1421] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1421) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1470] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1470) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1519] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1519) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1568] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 
9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1568) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1617] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1617) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1666] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1666) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1715] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1715) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1764] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 1364], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1813] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1813) // 63 
* 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1862] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1862) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1911] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1911) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1960] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1960) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2009] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2009) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2058] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2058) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 
% 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2107] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2107) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2156] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2156) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2205] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 1707], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2254] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2254) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2303] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2303) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2352] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and 
(threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2352) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2401] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2401) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2450] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2450) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2499] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2499) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2548] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2548) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2597] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, 
data_1[cse_var_2 + (threadIdx_x_1 + 2597) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2646] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 2050], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2695] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2695) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2744] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2744) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2793] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2793) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2842] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2842) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2891] = 
T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2891) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2940] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2940) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2989] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2989) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3038] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3038) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3087] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 2393], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3136] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, 
data_1[cse_var_2 + (threadIdx_x_1 + 3136) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3185] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3185) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3234] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3234) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3283] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3283) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3332] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3332) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3381] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3381) // 63 * 49 + (threadIdx_x_1 // 
7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3430] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3430) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3479] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3479) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3528] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 2736], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3577] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3577) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3626] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3626) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3675] = 
T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3675) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3724] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3724) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3773] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3773) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3822] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3822) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3871] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3871) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3920] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 
% 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3920) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3969] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 3079], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if threadIdx_x_1 < 14: - pad_temp_shared_1[threadIdx_x_1 + 4018] = T.if_then_else(threadIdx_x_1 < 7 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x_1 + 41], T.float32(0.0)) - threadIdx_x_2 = T.env_thread("threadIdx.x") - kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") - kernel_1 = T.Buffer((2359296,), data=kernel.data) - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2] = kernel_1[blockIdx_x * 36864 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 49] = kernel_1[blockIdx_x * 36864 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 147] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 98] = kernel_1[blockIdx_x * 36864 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 294] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 147] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 147) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 147) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 196] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 196) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 12] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 245] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 245) 
// 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 159] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 294] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 294) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 306] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 343] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 343) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 151) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 392] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 392) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 24] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 441] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 441) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 171] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 490] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 490) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 318] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 539] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 539) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 155) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 588] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 588) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 36] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 637] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 637) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 183] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 686] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 686) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 330] - with T.launch_thread(threadIdx_x_2, 49): - 
kernel_shared_1[threadIdx_x_2 + 735] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 735) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 159) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 784] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 784) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 48] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 833] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 833) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 195] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 882] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 882) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 342] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 931] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 931) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 163) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 980] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 980) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 60] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1029] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1029) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 207] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1078] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1078) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 354] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1127] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1127) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 167) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1176] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1176) // 192 * 4608 + 
cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 72] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1225] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1225) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 219] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1274] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1274) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 366] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1323] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1323) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 171) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1372] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1372) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 84] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1421] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1421) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 231] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1470] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1470) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 378] - with T.launch_thread(threadIdx_x_2, 49): - if threadIdx_x_2 < 17: - kernel_shared_1[threadIdx_x_2 + 1519] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1519) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 525] - for rc_outer_inner in range(8): - cse_var_3: T.int32 = rc_outer_inner * 24 - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 192] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 384] 
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 576] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 3] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 195] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 387] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 579] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 6] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 198] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 390] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 582] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 9] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 201] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 393] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 585] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 12] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * 
kernel_shared_1[cse_var_3 + 204] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 396] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 588] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 15] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 207] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 399] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 591] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 18] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 210] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 402] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 594] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 21] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 213] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 405] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 597] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 
+ threadIdx_x] * kernel_shared_1[cse_var_3 + 768] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 960] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 1152] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 1344] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 771] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 963] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 1155] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 1347] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 774] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 966] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 1158] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 1350] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 777] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 969] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 1161] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 
504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 1353] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 780] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 972] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 1164] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 1356] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 783] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 975] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 1167] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 1359] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 786] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 978] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 1170] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 1362] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 789] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 981] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 1173] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 1365] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 1] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 193] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 385] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 577] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 4] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 196] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 388] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 580] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 7] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 199] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 391] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 583] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 10] - conv2d_nchw_1[1] = 
conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 202] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 394] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 586] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 13] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 205] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 397] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 589] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 16] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 208] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 400] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 592] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 19] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 211] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 403] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 
595] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 22] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 214] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 406] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 598] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 769] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 961] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 1153] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 1345] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 772] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 964] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 1156] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 1348] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 775] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 967] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * 
kernel_shared_1[cse_var_3 + 1159] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 1351] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 778] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 970] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 1162] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 1354] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 781] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 973] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 1165] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 1357] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 784] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 976] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 1168] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 1360] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 787] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 979] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 1171] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 1363] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 790] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 982] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 1174] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 1366] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 2] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 194] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 386] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 578] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 5] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 197] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 389] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 581] - conv2d_nchw_1[0] = 
conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 8] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 200] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 392] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 584] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 11] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 203] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 395] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 587] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 14] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 206] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 398] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 590] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 17] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 209] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 401] 
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 593] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 20] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 212] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 404] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 596] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 23] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 215] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 407] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 599] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 770] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 962] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 1154] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 1346] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 773] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * 
kernel_shared_1[cse_var_3 + 965] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 1157] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 1349] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 776] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 968] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 1160] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 1352] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 779] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 971] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 1163] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 1355] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 782] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 974] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 1166] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 1358] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 785] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 977] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 1169] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 1361] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 788] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 980] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 1172] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 1364] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 791] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 983] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 1175] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 1367] - for i1_inner in range(8): - compute_1 = T.Buffer((25088,), data=compute.data) - bias_1 = T.Buffer((512,), data=bias.data) - compute_1[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = T.max(conv2d_nchw_1[i1_inner] + bias_1[blockIdx_x * 8 + i1_inner], T.float32(0.0)) -Execution time of this operator: 0.101 ms -Equivalent python schedule: -pad_temp_i0, pad_temp_i1, pad_temp_i2, pad_temp_i3 = 
tuple(pad_temp.op.axis) + tuple(pad_temp.op.reduce_axis) -conv2d_nchw_nn, conv2d_nchw_ff, conv2d_nchw_yy, conv2d_nchw_xx, conv2d_nchw_rc, conv2d_nchw_ry, conv2d_nchw_rx = tuple(conv2d_nchw.op.axis) + tuple(conv2d_nchw.op.reduce_axis) -T_add_ax0, T_add_ax1, T_add_ax2, T_add_ax3 = tuple(T_add.op.axis) + tuple(T_add.op.reduce_axis) -compute_i0, compute_i1, compute_i2, compute_i3 = tuple(compute.op.axis) + tuple(compute.op.reduce_axis) -s[T_add].compute_inline() -conv2d_nchw_nn_o_i, conv2d_nchw_nn_i = s[conv2d_nchw].split(conv2d_nchw_nn, factor=1) -conv2d_nchw_nn_o_o_i, conv2d_nchw_nn_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_i, factor=1) -conv2d_nchw_nn_o_o_o_i, conv2d_nchw_nn_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_i, factor=1) -conv2d_nchw_nn_o_o_o_o, conv2d_nchw_nn_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_o_i, factor=1) -conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=4) -conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=2) -conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=1) -conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=1) -conv2d_nchw_yy_o_i, conv2d_nchw_yy_i = s[conv2d_nchw].split(conv2d_nchw_yy, factor=1) -conv2d_nchw_yy_o_o_i, conv2d_nchw_yy_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_i, factor=1) -conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=7) -conv2d_nchw_yy_o_o_o_o, conv2d_nchw_yy_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_o_i, factor=1) -conv2d_nchw_xx_o_i, conv2d_nchw_xx_i = s[conv2d_nchw].split(conv2d_nchw_xx, factor=1) -conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=1) -conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=7) -conv2d_nchw_xx_o_o_o_o, conv2d_nchw_xx_o_o_o_i = 
s[conv2d_nchw].split(conv2d_nchw_xx_o_o_o_i, factor=1) -conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=8) -conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=8) -conv2d_nchw_ry_o_i, conv2d_nchw_ry_i = s[conv2d_nchw].split(conv2d_nchw_ry, factor=1) -conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=3) -conv2d_nchw_rx_o_i, conv2d_nchw_rx_i = s[conv2d_nchw].split(conv2d_nchw_rx, factor=1) -conv2d_nchw_rx_o_o, conv2d_nchw_rx_o_i = s[conv2d_nchw].split(conv2d_nchw_rx_o_i, factor=1) -s[conv2d_nchw].reorder(conv2d_nchw_nn_o_o_o_o, conv2d_nchw_ff_o_o_o_o, conv2d_nchw_yy_o_o_o_o, conv2d_nchw_xx_o_o_o_o, conv2d_nchw_nn_o_o_o_i, conv2d_nchw_ff_o_o_o_i, conv2d_nchw_yy_o_o_o_i, conv2d_nchw_xx_o_o_o_i, conv2d_nchw_nn_o_o_i, conv2d_nchw_ff_o_o_i, conv2d_nchw_yy_o_o_i, conv2d_nchw_xx_o_o_i, conv2d_nchw_rc_o_o, conv2d_nchw_ry_o_o, conv2d_nchw_rx_o_o, conv2d_nchw_rc_o_i, conv2d_nchw_ry_o_i, conv2d_nchw_rx_o_i, conv2d_nchw_nn_o_i, conv2d_nchw_ff_o_i, conv2d_nchw_yy_o_i, conv2d_nchw_xx_o_i, conv2d_nchw_rc_i, conv2d_nchw_ry_i, conv2d_nchw_rx_i, conv2d_nchw_nn_i, conv2d_nchw_ff_i, conv2d_nchw_yy_i, conv2d_nchw_xx_i) -compute_i0_o_i, compute_i0_i = s[compute].split(compute_i0, factor=1) -compute_i0_o_o_i, compute_i0_o_i = s[compute].split(compute_i0_o_i, factor=1) -compute_i0_o_o_o, compute_i0_o_o_i = s[compute].split(compute_i0_o_o_i, factor=1) -compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=8) -compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=1) -compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=1) -compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=1) -compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=7) -compute_i2_o_o_o, compute_i2_o_o_i = s[compute].split(compute_i2_o_o_i, factor=1) -compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=1) 
-compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=7) -compute_i3_o_o_o, compute_i3_o_o_i = s[compute].split(compute_i3_o_o_i, factor=1) -s[compute].reorder(compute_i0_o_o_o, compute_i1_o_o_o, compute_i2_o_o_o, compute_i3_o_o_o, compute_i0_o_o_i, compute_i1_o_o_i, compute_i2_o_o_i, compute_i3_o_o_i, compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i, compute_i0_i, compute_i1_i, compute_i2_i, compute_i3_i) -s[conv2d_nchw].compute_at(s[compute], compute_i3_o_i) -kernel_shared = s.cache_read(kernel, "shared", [conv2d_nchw]) -kernel_shared_ax0, kernel_shared_ax1, kernel_shared_ax2, kernel_shared_ax3 = tuple(kernel_shared.op.axis) -s[kernel_shared].compute_at(s[conv2d_nchw], conv2d_nchw_rx_o_o) -pad_temp_shared = s.cache_read(pad_temp, "shared", [conv2d_nchw]) -pad_temp_shared_ax0, pad_temp_shared_ax1, pad_temp_shared_ax2, pad_temp_shared_ax3 = tuple(pad_temp_shared.op.axis) -s[pad_temp_shared].compute_at(s[conv2d_nchw], conv2d_nchw_rx_o_o) -s[pad_temp].compute_inline() -compute_i0_o_o_o_i1_o_o_o_fused_i2_o_o_o_fused_i3_o_o_o_fused = s[compute].fuse(compute_i0_o_o_o, compute_i1_o_o_o, compute_i2_o_o_o, compute_i3_o_o_o) -s[compute].bind(compute_i0_o_o_o_i1_o_o_o_fused_i2_o_o_o_fused_i3_o_o_o_fused, te.thread_axis("blockIdx.x")) -compute_i0_o_o_i_i1_o_o_i_fused_i2_o_o_i_fused_i3_o_o_i_fused = s[compute].fuse(compute_i0_o_o_i, compute_i1_o_o_i, compute_i2_o_o_i, compute_i3_o_o_i) -s[compute].bind(compute_i0_o_o_i_i1_o_o_i_fused_i2_o_o_i_fused_i3_o_o_i_fused, te.thread_axis("vthread")) -compute_i0_o_i_i1_o_i_fused_i2_o_i_fused_i3_o_i_fused = s[compute].fuse(compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i) -s[compute].bind(compute_i0_o_i_i1_o_i_fused_i2_o_i_fused_i3_o_i_fused, te.thread_axis("threadIdx.x")) -kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[kernel_shared].fuse(kernel_shared_ax0, kernel_shared_ax1, kernel_shared_ax2, kernel_shared_ax3) -kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, 
kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1) -s[kernel_shared].vectorize(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i) -kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=49) -s[kernel_shared].bind(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x")) -pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[pad_temp_shared].fuse(pad_temp_shared_ax0, pad_temp_shared_ax1, pad_temp_shared_ax2, pad_temp_shared_ax3) -pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1) -s[pad_temp_shared].vectorize(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i) -pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=49) -s[pad_temp_shared].bind(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x")) -s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "auto_unroll_max_step", 1024) -s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "unroll_explicit", True) - diff --git a/llpass.py b/llpass.py deleted file mode 100644 index 35283421970a..000000000000 --- a/llpass.py +++ /dev/null @@ -1,162 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Writing a Customized Pass -========================= -**Author**: `Jian Weng `_ - -TVM is a framework that abstracts away the heterogenity of machine learning accelerators. -Sometimes users may want customize some analysis and IR transformations -to adapt TVM to their own specialized hardware. This tutorial helps users write -a customized pass in TVM. - -Prerequisites -------------- - -Before reading this tutorial, we assume readers have already known these topics well: - -- Writing an algorithm in TVM and schedule it. Otherwise, see example tutorials like - :ref:`opt-gemm`. -- The basic structure of HalideIR. Otherwise, see ``HalideIR/src/ir/IR.h`` to learn what - attributes of IR nodes are defined. -- Visitor design pattern. Otherwise, check the - `Python AST module `_ to see how an AST - visitor is implemented. -- How a Schedule is lowered to either an IRModule class or a LLVM module. Otherwise, - take a look at ``python/tvm/build_module.py`` to get some basics. - -""" - -import tvm -from tvm import te -import numpy as np - -###################################################################### -# We first write a very simple vector add and build it with the default schedule. Then, we use -# our customized lowering pass to manipulate the IR directly instead of using schedule primitives. 
-# - -n = tvm.tir.const(128, "int32") -a = te.placeholder((n,), name="a") -b = te.placeholder((n,), name="b") -c = te.compute((n,), lambda i: a[i] + b[i], name="c") - -sch = te.create_schedule(c.op) -ir = tvm.lower(sch, [a, b, c]) -print(ir) - -###################################################################### -# Writing a Pass -# -------------- -# Essentially, an "IR transformation pass" is a function which maps a statement to a new statement. -# Thus, we define this vectorize function and implement it step by step. -# - -###################################################################### -# TVM already provides two class for users to both analyze and transform IR. -# -# IR Visitor -# ~~~~~~~~~~ -# We can use ``tvm.tir.stmt_functor.post_order_visit(stmt, func)`` to gather information from the Halide IR. -# ``func`` is a function callback. This function will be called before exiting the current IR node, -# i.e. post-order visit. Then we leverage side effects to store the result of IR visit, because the -# return value of ``func`` will be ignored. -# -# .. note:: -# -# You MUST use some array to store the result of IR visit. Even the value is a single variable. -# This is mainly due to the constraints in the Python-C runtime. The variable values will be -# refreshed every recursion but the array values will be preserved. -# - - -def find_width8(op): - - print(op) - print(type(op)) - -##################################################################### -# IR Transformation -# ~~~~~~~~~~~~~~~~~ -# The transformation interface is slightly different from the visitor interface. There is only a -# post-order callback in the visitor, but transformation visitor supports both a pre-order and a -# post-order callback. If you want to keep the origin IR node, just return None. If you want to -# change the current node to some node, use TVM IR maker interface to build it and return -# this value. -# -# .. 
note:: -# -# If the pre-order function is called and returns a value which is not None, the post-order -# function will be skipped. -# - - -def vectorize8(op): - """Split can vectorize the loops found in `find_width8`.""" - if op in loops: - extent = op.extent.value - name = op.loop_var.name - lo, li = te.var(name + ".outer"), te.var(name + ".inner") - body = tvm.tir.stmt_functor.substitute(op.body, {op.loop_var: lo * 8 + li}) - body = tvm.tir.For(li, 0, 8, tvm.tir.ForKind.VECTORIZED, body) - body = tvm.tir.For(lo, 0, extent // 8, tvm.tir.ForKind.SERIAL, body) - return body - return None - - -@tvm.tir.transform.prim_func_pass(opt_level=0) -def vectorize(f, mod, ctx): - tvm.tir.stmt_functor.post_order_visit(f.body, find_width8) - return f - - -##################################################################### -# Glue to Lowering -# ---------------- -# So far, we are done with writing this IR transformation pass. What we need to do next is to glue -# this pass to TVM's lower pass. -# -# In this case, we inject the pass written above into the TVM standard lowering -# pass by feeding **a list of tuple** as argument to ``tir.add_lower_pass``. "Tuple" indicates different -# phases of lowering. In TVM, there are four phases of lowering and user-customized ones will be -# called after each phase is done. -# -# .. note:: -# Here are the essential transformations done by each phase: -# - Phase 0 generates the raw IR and loop levels. -# - Phase 1 flattens the array storage. -# - Phase 2 transforms loops, like unroll, vectorization and thread-binding. -# - Phase 3 does some cleanup work. -# -# Thus, a good place to put this transformation pass is just after Phase 1. 
-# - -for i in range(4): - print(f"Phase {i}") - print("-" * 20) - with tvm.transform.PassContext(config={"tir.add_lower_pass": [(i, vectorize)]}): - print(tvm.lower(sch, [a, b, c])) - -##################################################################### -# Quick View -# ---------- -# This tutorial gives a quick view of writing a customized IR transformation pass: -# - Use ``tvm.tir.stmt_functor.post_order_visit`` to gather information on each IR nodes. -# - Use ``tvm.tir.stmt_functor.ir_transform`` to transform IR nodes. -# - Wrap up two above to write an IR-transformation function. -# - Use ``tvm.transform.PassContext`` to put this function to TVM lowering pass -# \ No newline at end of file diff --git a/lowered_tir.py b/lowered_tir.py deleted file mode 100644 index e26cd3a2dab2..000000000000 --- a/lowered_tir.py +++ /dev/null @@ -1,481 +0,0 @@ -from tvm.script import ir as I -from tvm.script import tir as T - -@I.ir_module -class Module: - @T.prim_func - def main(data: T.Buffer((1, 512, 7, 7), "float32"), kernel: T.Buffer((512, 512, 3, 3), "float32"), bias: T.Buffer((1, 512, 1, 1), "float32"), compute: T.Buffer((1, 512, 7, 7), "float32")): - T.func_attr({"from_legacy_te_schedule": T.bool(True), "tir.noalias": T.bool(True)}) - blockIdx_x = T.launch_thread("blockIdx.x", 64) - conv2d_nchw = T.allocate([8], "float32", "local") - pad_temp_shared = T.allocate([4032], "float32", "shared") - kernel_shared = T.allocate([1536], "float32", "shared") - threadIdx_x = T.launch_thread("threadIdx.x", 49) - conv2d_nchw_1 = T.Buffer((8,), data=conv2d_nchw, scope="local", align=32) - conv2d_nchw_1[0] = T.float32(0.0) - conv2d_nchw_1[1] = T.float32(0.0) - conv2d_nchw_1[2] = T.float32(0.0) - conv2d_nchw_1[3] = T.float32(0.0) - conv2d_nchw_1[4] = T.float32(0.0) - conv2d_nchw_1[5] = T.float32(0.0) - conv2d_nchw_1[6] = T.float32(0.0) - conv2d_nchw_1[7] = T.float32(0.0) - for rc_outer_outer, rx_outer_outer in T.grid(8, 3): - cse_var_2: T.int32 = rc_outer_outer * 3136 - cse_var_1: 
T.int32 = rc_outer_outer * 576 - threadIdx_x_1 = T.env_thread("threadIdx.x") - pad_temp_shared_1 = T.Buffer((4032,), data=pad_temp_shared, scope="shared") - data_1 = T.Buffer((25088,), data=data.data) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 49] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 49) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 98] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 98) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 147] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 147) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 196] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 196) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with 
T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 245] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 245) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 294] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 294) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 343] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 343) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 392] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 392) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 441] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 335], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 490] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= 
rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 490) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 539] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 539) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 588] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 588) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 637] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 637) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 686] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 686) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 735] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, 
data_1[cse_var_2 + (threadIdx_x_1 + 735) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 784] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 784) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 833] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 833) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 882] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 678], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 931] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 931) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 980] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 980) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 
49): - pad_temp_shared_1[threadIdx_x_1 + 1029] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1029) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1078] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1078) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1127] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1127) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1176] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1176) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1225] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1225) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1274] = 
T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1274) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1323] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 1021], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1372] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1372) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1421] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1421) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1470] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1470) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1519] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1519) // 63 * 49 + threadIdx_x_1 + 
rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1568] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1568) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1617] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1617) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1666] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1666) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1715] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1715) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1764] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 1364], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1813] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 
and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1813) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1862] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1862) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1911] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1911) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 1960] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 1960) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2009] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2009) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2058] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + 
threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2058) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2107] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2107) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2156] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2156) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2205] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 1707], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2254] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2254) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2303] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2303) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + 
threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2352] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2352) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2401] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2401) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2450] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2450) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2499] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2499) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2548] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2548) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with 
T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2597] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2597) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2646] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 2050], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2695] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2695) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2744] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2744) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2793] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2793) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2842] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 
< 8, data_1[cse_var_2 + (threadIdx_x_1 + 2842) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2891] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2891) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2940] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2940) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 2989] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 2989) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3038] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3038) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3087] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 2393], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - 
pad_temp_shared_1[threadIdx_x_1 + 3136] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3136) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3185] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3185) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3234] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3234) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3283] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3283) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3332] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3332) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3381] = T.if_then_else(1 <= 
(threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3381) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3430] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3430) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3479] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3479) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3528] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 2736], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3577] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 7) % 9 and (threadIdx_x_1 // 7 + 7) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3577) // 63 * 49 + (threadIdx_x_1 // 7 + 7) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3626] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 5) % 9 and (threadIdx_x_1 // 7 + 5) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + 
(threadIdx_x_1 + 3626) // 63 * 49 + (threadIdx_x_1 // 7 + 5) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3675] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 3) % 9 and (threadIdx_x_1 // 7 + 3) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3675) // 63 * 49 + (threadIdx_x_1 // 7 + 3) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3724] = T.if_then_else(1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3724) // 63 * 49 + threadIdx_x_1 + rx_outer_outer - 1], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3773] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 8) % 9 and (threadIdx_x_1 // 7 + 8) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3773) // 63 * 49 + (threadIdx_x_1 // 7 + 8) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3822] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 6) % 9 and (threadIdx_x_1 // 7 + 6) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3822) // 63 * 49 + (threadIdx_x_1 // 7 + 6) % 9 * 7 + rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3871] = T.if_then_else(1 <= (threadIdx_x_1 // 7 + 4) % 9 and (threadIdx_x_1 // 7 + 4) % 9 < 8 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3871) // 63 * 49 + (threadIdx_x_1 // 7 + 4) % 9 * 7 + 
rx_outer_outer + threadIdx_x_1 % 7 - 8], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3920] = T.if_then_else(threadIdx_x_1 < 42 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 3920) // 63 * 49 + threadIdx_x_1 + rx_outer_outer + 6], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - pad_temp_shared_1[threadIdx_x_1 + 3969] = T.if_then_else(7 <= threadIdx_x_1 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + threadIdx_x_1 + rx_outer_outer + 3079], T.float32(0.0)) - with T.launch_thread(threadIdx_x_1, 49): - if T.likely(threadIdx_x_1 < 14): - pad_temp_shared_1[threadIdx_x_1 + 4018] = T.if_then_else(threadIdx_x_1 < 7 and 1 <= rx_outer_outer + threadIdx_x_1 % 7 and rx_outer_outer + threadIdx_x_1 % 7 < 8, data_1[cse_var_2 + (threadIdx_x_1 + 4018) // 63 * 49 + rx_outer_outer + threadIdx_x_1 + 41], T.float32(0.0)) - threadIdx_x_2 = T.env_thread("threadIdx.x") - kernel_shared_1 = T.Buffer((1536,), data=kernel_shared, scope="shared") - kernel_1 = T.Buffer((2359296,), data=kernel.data) - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2] = kernel_1[blockIdx_x * 36864 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 49] = kernel_1[blockIdx_x * 36864 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 147] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 98] = kernel_1[blockIdx_x * 36864 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 294] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 147] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 147) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 147) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 196] = kernel_1[blockIdx_x * 36864 + 
(threadIdx_x_2 + 196) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 12] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 245] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 245) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 159] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 294] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 294) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 306] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 343] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 343) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 151) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 392] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 392) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 24] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 441] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 441) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 171] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 490] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 490) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 318] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 539] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 539) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 155) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 588] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 588) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 36] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 637] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 637) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 183] - with T.launch_thread(threadIdx_x_2, 
49): - kernel_shared_1[threadIdx_x_2 + 686] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 686) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 330] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 735] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 735) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 159) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 784] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 784) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 48] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 833] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 833) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 195] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 882] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 882) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 342] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 931] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 931) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 163) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 980] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 980) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 60] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1029] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1029) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 207] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1078] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1078) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 354] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1127] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1127) // 192 * 4608 + cse_var_1 + 
(threadIdx_x_2 + 167) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1176] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1176) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 72] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1225] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1225) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 219] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1274] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1274) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 366] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1323] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1323) // 192 * 4608 + cse_var_1 + (threadIdx_x_2 + 171) % 192 * 3 + rx_outer_outer] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1372] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1372) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 84] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1421] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1421) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 231] - with T.launch_thread(threadIdx_x_2, 49): - kernel_shared_1[threadIdx_x_2 + 1470] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1470) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 378] - with T.launch_thread(threadIdx_x_2, 49): - if T.likely(threadIdx_x_2 < 17): - kernel_shared_1[threadIdx_x_2 + 1519] = kernel_1[blockIdx_x * 36864 + (threadIdx_x_2 + 1519) // 192 * 4608 + cse_var_1 + threadIdx_x_2 * 3 + rx_outer_outer + 525] - for rc_outer_inner in range(8): - cse_var_3: T.int32 = rc_outer_inner * 24 - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 192] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 384] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 576] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 3] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 195] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 387] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 579] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 6] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 198] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 390] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 582] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 9] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 201] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 393] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 585] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 12] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 204] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 396] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 588] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 15] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 207] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 399] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 591] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 18] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 210] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 402] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 594] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 21] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 213] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 405] - conv2d_nchw_1[3] 
= conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 597] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 768] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 960] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 1152] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x] * kernel_shared_1[cse_var_3 + 1344] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 771] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 963] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 1155] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 63] * kernel_shared_1[cse_var_3 + 1347] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 774] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 966] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 1158] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 126] * kernel_shared_1[cse_var_3 + 1350] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 777] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 969] - 
conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 1161] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 189] * kernel_shared_1[cse_var_3 + 1353] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 780] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 972] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 1164] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 252] * kernel_shared_1[cse_var_3 + 1356] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 783] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 975] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 1167] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 315] * kernel_shared_1[cse_var_3 + 1359] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 786] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 978] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 1170] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 378] * kernel_shared_1[cse_var_3 + 1362] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * 
kernel_shared_1[cse_var_3 + 789] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 981] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 1173] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 441] * kernel_shared_1[cse_var_3 + 1365] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 1] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 193] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 385] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 577] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 4] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 196] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 388] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 580] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 7] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 199] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 391] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x 
+ 133] * kernel_shared_1[cse_var_3 + 583] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 10] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 202] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 394] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 586] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 13] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 205] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 397] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 589] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 16] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 208] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 400] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 592] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 19] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 211] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 403] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 595] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 22] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 214] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 406] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 598] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 769] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 961] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 1153] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 7] * kernel_shared_1[cse_var_3 + 1345] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 772] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 964] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 1156] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 70] * kernel_shared_1[cse_var_3 + 1348] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 775] - conv2d_nchw_1[5] = 
conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 967] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 1159] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 133] * kernel_shared_1[cse_var_3 + 1351] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 778] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 970] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 1162] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 196] * kernel_shared_1[cse_var_3 + 1354] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 781] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 973] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 1165] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 259] * kernel_shared_1[cse_var_3 + 1357] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 784] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 976] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * kernel_shared_1[cse_var_3 + 1168] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 322] * 
kernel_shared_1[cse_var_3 + 1360] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 787] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 979] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 1171] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 385] * kernel_shared_1[cse_var_3 + 1363] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 790] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 982] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 1174] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 448] * kernel_shared_1[cse_var_3 + 1366] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 2] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 194] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 386] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 578] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 5] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 197] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + 
threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 389] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 581] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 8] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 200] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 392] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 584] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 11] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 203] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 395] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 587] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 14] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 206] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 398] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 590] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 17] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + 
pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 209] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 401] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 593] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 20] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 212] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 404] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 596] - conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 23] - conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 215] - conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 407] - conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 599] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 770] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 962] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 1154] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 14] * kernel_shared_1[cse_var_3 + 1346] - conv2d_nchw_1[4] 
= conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 773] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 965] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 1157] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 77] * kernel_shared_1[cse_var_3 + 1349] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 776] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 968] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 1160] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 140] * kernel_shared_1[cse_var_3 + 1352] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 779] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 971] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 1163] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 203] * kernel_shared_1[cse_var_3 + 1355] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 782] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 974] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * 
kernel_shared_1[cse_var_3 + 1166] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 266] * kernel_shared_1[cse_var_3 + 1358] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 785] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 977] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 1169] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 329] * kernel_shared_1[cse_var_3 + 1361] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 788] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 980] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 1172] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 392] * kernel_shared_1[cse_var_3 + 1364] - conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 791] - conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 983] - conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 1175] - conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 504 + threadIdx_x + 455] * kernel_shared_1[cse_var_3 + 1367] - for i1_inner in range(8): - compute_1 = T.Buffer((25088,), data=compute.data) - bias_1 = T.Buffer((512,), data=bias.data) - compute_1[blockIdx_x * 392 + i1_inner * 49 + threadIdx_x] = 
T.max(conv2d_nchw_1[i1_inner] + bias_1[blockIdx_x * 8 + i1_inner], T.float32(0.0)) - -import tvm -from tvm import te, tir, IRModule - -mod: IRModule = Module -print(mod.script()) - -# get stmt -from tvm.tir.stmt_functor import ir_transform, post_order_visit -stmt = mod["main"] -print(stmt) - -# use post_order_visit to get all the stmt -# make a function to visit the stmt -def visit_stmt(stmt): - print("visit stmt") - print(type(stmt)) - - -tvm.tir.round() \ No newline at end of file From 2c3582dda1874b94478f50fe26ab545482006ed2 Mon Sep 17 00:00:00 2001 From: Dwijen Chawra Date: Thu, 5 Dec 2024 00:34:26 -0500 Subject: [PATCH 8/8] updates --- .../auto_scheduler/cost_model/gnn_model.py | 102 ++++++++++++++++-- 1 file changed, 92 insertions(+), 10 deletions(-) diff --git a/python/tvm/auto_scheduler/cost_model/gnn_model.py b/python/tvm/auto_scheduler/cost_model/gnn_model.py index cb256d3f0f5f..c44e56678796 100644 --- a/python/tvm/auto_scheduler/cost_model/gnn_model.py +++ b/python/tvm/auto_scheduler/cost_model/gnn_model.py @@ -35,6 +35,7 @@ from ..measure_record import RecordReader # from ..search_task import SearchTask import tvm.te as te +import tvm.tir as tir import tvm import networkx as nx import matplotlib.pyplot as plt @@ -44,7 +45,11 @@ import uuid from ..measure import MeasureInput +from ...ir.supply import GlobalVarSupply from pyvis.network import Network +import inspect + +from tqdm import tqdm try: from xgboost.callback import TrainingCallback # type: ignore @@ -105,7 +110,7 @@ def postorder(node): # Return None to continue postorder processing return None - @tvm.tir.transform.prim_func_pass(opt_level=3) + @tvm.tir.transform.prim_func_pass(opt_level=1) def ast_extractor(f, mod, ctx): # clear the graph graph.clear() @@ -117,16 +122,79 @@ def ast_extractor(f, mod, ctx): parent_stack.append("root") tvm.tir.stmt_functor.ir_transform(f.body, preorder, postorder) + print(graph) + + print("types:", len(types), types) return f # apply the state transformations 
schedule, args = task.compute_dag.apply_steps_from_state(state) schedule: te.Schedule - with tvm.transform.PassContext(config={"tir.add_lower_pass": [(3, ast_extractor)]}): + ctx = tvm.transform.PassContext(opt_level=1, config={ + "tir.disable_vectorize": False, + "tir.instrument_bound_checkers": False, + "tir.add_lower_pass": [ + (0, tvm.tir.transform.InjectPrefetch()), + (0, tvm.tir.transform.StorageFlatten(64, False)), + (1, tvm.tir.transform.NarrowDataType(32)), + (1, tvm.tir.transform.Simplify()), + (1, tvm.tir.transform.VectorizeLoop(False)), + (1, tvm.tir.transform.InjectVirtualThread()), + (1, tvm.tir.transform.StorageRewrite()), + (1, tvm.tir.transform.Simplify()), # skipped verifygpucode + (1, ast_extractor) + ] + }) + + with ctx: mod: tvm.ir.module.IRModule = tvm.lower(schedule, args) +lbb = tvm.get_global_func("auto_scheduler.local_builder.build") +schtomod = tvm.get_global_func("driver.schedule_to_module") +lowersched = tvm.get_global_func("driver.lower_schedule") + + def get_gnn_features(task: SearchTask, states: List[State]): + # Prepare MeasureInputs for all states + # inputs = [MeasureInput(task, s).serialize() for s in states] + inputs = [MeasureInput(task, s) for s in states] + + # Use the local_builder_build function to build in parallel + features = lbb(inputs, 10, multiprocessing.cpu_count(), "default", 1) + + # Extract features from the build results + # features = [extract_features_from_build_result(res) for res in build_results] # Implement this function as needed + return features + +def get_gnn_features_sch2mod(task: SearchTask, states: List[State]): + # Serialize the inputs for parallel processing + serialized_inputs = [MeasureInput(task, s).serialize() for s in states] + + # Convert each serialized input to a module in parallel + ctx = multiprocessing.get_context('fork') + with multiprocessing.pool.Pool(multiprocessing.cpu_count(), context=ctx) as executor: + # Use tqdm to wrap the iterable for progress tracking + modules = 
list(tqdm(executor.imap(sched_to_mod, serialized_inputs), total=len(serialized_inputs))) + + # Extract features from the modules + features = [] + for module in modules: + # Assuming there's a function to extract features from a module + # feature = extract_features_from_module(module) + features.append("e") + + return features + +def sched_to_mod(serialized_arg): + minput = MeasureInput.deserialize(serialized_arg) + task, state = minput.task, minput.state + schedule, args = task.compute_dag.apply_steps_from_state(state) + # module = schtomod(schedule, args, "tmp_func", {}) + module = lowersched(schedule, args, "tmp_func", {}, False) + return module + +def get_gnn_features_old(task: SearchTask, states: List[State]): # parallel process all the states args = list(zip([(task)]*len(states), states)) @@ -134,13 +202,13 @@ def get_gnn_features(task: SearchTask, states: List[State]): inputs = [MeasureInput(task, s).serialize() for s in states] - # ctx = multiprocessing.get_context('fork') - - with multiprocessing.pool.Pool(multiprocessing.cpu_count()) as executor: + ctx = multiprocessing.get_context('fork') + with multiprocessing.pool.Pool(multiprocessing.cpu_count(), context=ctx) as executor: features = list(executor.map(gnn_feature_extractor_tup, inputs)) return features + class GNNModel(PythonBasedModel): """Train a GNN model that learns from the AST representation of a TIR program and predicts the performance of the program. 
@@ -293,18 +361,32 @@ def predict(self, task, states): scores: List[float] The predicted scores for all states """ + + # start_time_sch2mod = time.time() + # features = get_gnn_features_sch2mod(task, states) + # end_time_sch2mod = time.time() + # print(f"Time taken for get_gnn_features_sch2mod: {end_time_sch2mod - start_time_sch2mod:.6f} seconds") + + + # # Timing the first function call + # start_time_gnn = time.time() + # features = get_gnn_features(task, states) + # end_time_gnn = time.time() + # print(f"Time taken for get_gnn_features: {end_time_gnn - start_time_gnn:.6f} seconds") + + # start_time_gnn_old = time.time() + # features = get_gnn_features_old(task, states[:1]) + # end_time_gnn_old = time.time() + # print(f"Time taken for get_gnn_features_old: {end_time_gnn_old - start_time_gnn_old:.6f} seconds") - # Timing the first function call - start_time_gnn = time.time() - features = get_gnn_features(task, states) - end_time_gnn = time.time() - print(f"Time taken for get_gnn_features: {end_time_gnn - start_time_gnn:.6f} seconds") # Timing the second function call start_time_per_store = time.time() features = get_per_store_features_from_states(states, task) end_time_per_store = time.time() print(f"Time taken for get_per_store_features_from_states: {end_time_per_store - start_time_per_store:.6f} seconds") + + print("model?", self.bst is not None and len(self.inputs) > self.num_warmup_sample) if self.bst is not None and len(self.inputs) > self.num_warmup_sample: dtest, pack_ids = feature_to_pack_sum_xgbmatrix(features) raw_preds = self.bst.predict(dtest)