diff --git a/lib/THCUNN/SpatialAdaptiveAveragePooling.cu b/lib/THCUNN/SpatialAdaptiveAveragePooling.cu
new file mode 100644
index 00000000..b1e5e5c6
--- /dev/null
+++ b/lib/THCUNN/SpatialAdaptiveAveragePooling.cu
@@ -0,0 +1,200 @@
+#include "THCUNN.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+#include "THCAtomics.cuh"
+
+#define START_IND(a,b,c) (int)floor((float)(a * c) / b)
+#define END_IND(a,b,c) (int)ceil((float)((a + 1) * c) / b)
+// #define START_IND(a,b,c) a * c / b
+// #define END_IND(a,b,c) (a + 1) * c / b + ((a + 1) * c % b > 0)?1:0
+
+
+#define CUDA_MAX_THREADS 1024   // this is safe, in reality 256 is our limit
+
+/*
+ * Description:
+ *    this function adaptively average pools an input 4D tensor along dimensions 2 and 3
+ *    4D input, 4D output
+ */
+ template <typename T>
+__global__ void adaptiveaveragepool(T *input, T *output,
+                        int input_n, int input_h, int input_w,
+                        int output_h, int output_w,
+                        int strideh, int stridew,
+                        int strided)
+{
+  // iterators
+  int xx, yy;
+
+  // compute offsets based on thread/block ID
+  int o = blockIdx.x;
+  int i = o;
+  //int k = blockIdx.x % input_n;
+
+  int xx_start = threadIdx.x;
+  int xx_end = output_w;
+  const int xx_step = blockDim.x;
+
+  int yy_start = blockDim.y*blockIdx.y + threadIdx.y;
+  int yy_end = output_h;
+  const int yy_step = blockDim.y*gridDim.y;
+  // select input/output plane
+  output = output + o*output_w*output_h;
+  input = input + i*strided;
+
+  // For all output pixels...
+  for(yy = yy_start; yy < yy_end; yy+=yy_step) {
+
+    int y_start = START_IND(yy, output_h, input_h);
+    int y_end   = END_IND(yy, output_h, input_h);
+    int kH = y_end-y_start;
+
+    for(xx = xx_start; xx < xx_end; xx+=xx_step) {
+
+      int x_start = START_IND(xx, output_w, input_w);
+      int x_end   = END_IND(xx, output_w, input_w);
+      int kW = x_end-x_start;
+
+      // Compute the average pooling
+      T *ptr_input = input + y_start*strideh + x_start*stridew;
+      T *ptr_output = output + yy*output_w + xx;
+      T sum = ScalarConvert<int, T>::to(0);
+      int kx, ky;
+      for(ky = 0; ky < kH; ++ky) {
+        for(kx = 0; kx < kW; ++kx) {
+          T val = ptr_input[kx*stridew];
+          sum += val;
+        }
+        ptr_input += strideh; // next input line
+      }
+      // Update output
+      *ptr_output = sum / kH / kW;
+    }
+  }
+}
+
+/*
+ * Description:
+ *    this function computes the gradInput from gradOutput
+ */
+ template <typename T>
+__global__ void adaptiveaveragegradinput(
+  T *gradInput, T *gradOutput,
+  int input_n, int input_h, int input_w, int output_h, int output_w
+)
+{
+  // iterators
+  int x, y;
+
+  // compute offsets based on thread/block ID
+  int o = blockIdx.x;
+  int i = o;
+
+  int x_start = threadIdx.x;
+  int x_end = input_w;
+  int x_step = blockDim.x;
+
+  int y_start = blockDim.y*blockIdx.y + threadIdx.y;
+  int y_end = input_h;
+  int y_step = blockDim.y*gridDim.y;
+
+  // select input/output plane
+  gradOutput = gradOutput + o*output_w*output_h;
+  gradInput = gradInput + i*input_w*input_h;
+
+  // compute gradInput
+  for(y = y_start; y < y_end; y+=y_step) {
+
+    int yy_start = START_IND(y, input_h, output_h);
+    int yy_end   = END_IND(y, input_h, output_h);
+    int kH = yy_end-yy_start;
+
+    for(x = x_start; x < x_end; x+=x_step) {
+
+      int xx_start = START_IND(x, input_w, output_w);
+      int xx_end   = END_IND(x, input_w, output_w);
+      int kW = xx_end-xx_start;
+
+      // Compute the gradients
+      T *ptr_gradInput = gradInput + y*input_w + x;
+      T *ptr_gradOutput = gradOutput + yy_start*output_w + xx_start;
+
+      int kx, ky;
+      for(ky = 0; ky < kH; ++ky) {
+        int yy = yy_start + ky;
+        int kkH = START_IND(yy, output_h, input_h) - END_IND(yy, output_h, input_h);
+        for(kx = 0; kx < kW; ++kx) {
+          int xx = xx_start + kx;
+          int kkW = START_IND(xx, output_w, input_w) - END_IND(xx, output_w, input_w);
+          T z = ptr_gradOutput[kx + ky*output_w] / kkW / kkH;
+          *ptr_gradInput += z;
+        }
+      }
+    }
+  }
+}
+
+/*
+ * Description:
+ *    this function computes the gradInput from gradOutput
+ *    (uses atomic add)
+ */
+ template <typename T>
+__global__ void atomicadaptiveaveragegradinput(
+  T *gradInput, T *gradOutput,
+  int input_n, int input_h, int input_w, int output_h, int output_w
+)
+{
+  // iterators
+  int xx, yy;
+
+  // compute offsets based on thread/block ID
+  int o = blockIdx.x;
+  int i = o;
+
+  int xx_start = threadIdx.x;
+  int xx_end = output_w;
+  int xx_step = blockDim.x;
+
+  int yy_start = blockDim.y*blockIdx.y + threadIdx.y;
+  int yy_end = output_h;
+  int yy_step = blockDim.y*gridDim.y;
+
+  // select input/output plane
+  gradOutput = gradOutput + o*output_w*output_h;
+  gradInput = gradInput + i*input_w*input_h;
+
+  // compute gradInput
+  for(yy = yy_start; yy < yy_end; yy+=yy_step) {
+
+    int y_start = START_IND(yy, output_h, input_h);
+    int y_end   = END_IND(yy, output_h, input_h);
+    int kH = y_end-y_start;
+
+    for(xx = xx_start; xx < xx_end; xx+=xx_step) {
+
+      int x_start = START_IND(xx, output_w, input_w);
+      int x_end   = END_IND(xx, output_w, input_w);
+      int kW = x_end-x_start;
+
+      // Compute the gradients
+      T *ptr_gradInput = gradInput + y_start*input_w + x_start;
+      T *ptr_gradOutput = gradOutput + yy*output_w + xx;
+      T z = *ptr_gradOutput / kW / kH;
+      int kx, ky;
+      for(ky = 0; ky < kH; ++ky) {
+        for(kx = 0; kx < kW; ++kx) {
+          // atomic add since different threads could update same variable
+          atomicAdd(&(ptr_gradInput[kx + ky*input_w]), z);
+        }
+      }
+    }
+  }
+}
+
+#include "generic/SpatialAdaptiveAveragePooling.cu"
+#include "THCGenerateFloatTypes.h"
+
+#undef CUDA_MAX_THREADS
+#undef START_IND
+#undef END_IND
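Note: the window arithmetic above is easiest to see with a concrete size. Pooling a width of 10 down to 3 outputs gives the overlapping windows [0,4), [3,7) and [6,10); input columns 3 and 6 each contribute to two outputs, which is why the scatter-style backward kernel accumulates with atomicAdd. A minimal host-side sketch of the same macros, illustrative only and not part of the patch:

#include <math.h>
#include <stdio.h>

#define START_IND(a,b,c) (int)floor((float)((a) * (c)) / (b))
#define END_IND(a,b,c)   (int)ceil((float)(((a) + 1) * (c)) / (b))

int main(void)
{
  // Map 3 output columns back onto 10 input columns, as the kernels do.
  int input_w = 10, output_w = 3;
  for (int xx = 0; xx < output_w; ++xx)
    printf("output %d <- input columns [%d, %d)\n",
           xx, START_IND(xx, output_w, input_w), END_IND(xx, output_w, input_w));
  return 0;
}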
diff --git a/lib/THCUNN/generic/SpatialAdaptiveAveragePooling.cu b/lib/THCUNN/generic/SpatialAdaptiveAveragePooling.cu
new file mode 100644
index 00000000..e444d1f8
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialAdaptiveAveragePooling.cu
@@ -0,0 +1,173 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialAdaptiveAveragePooling.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(SpatialAdaptiveAveragePooling_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           int nOutputCols,
+           int nOutputRows)
+{
+  THCUNN_assertSameGPU(state, 2, input, output);
+
+  real *output_data;
+  real *input_data;
+
+  THCUNN_argCheck(state, input->nDimension == 3 || input->nDimension == 4, 2, input,
+                  "3D or 4D (batch mode) tensor expected for input, but got: %s");
+
+  if (input->nDimension == 3) {
+    long nInputCols = input->size[2];
+    long nInputRows = input->size[1];
+    long nInputPlane = input->size[0];
+
+    long istride_d = input->stride[0];
+    long istride_h = input->stride[1];
+    long istride_w = input->stride[2];
+
+    input_data = THCTensor_(data)(state, input);
+
+    THCTensor_(resize3d)(state, output, nInputPlane, nOutputRows, nOutputCols);
+
+    output_data = THCTensor_(data)(state, output);
+
+    // cuda blocks & threads:
+    int yblocks = (int)(16L / nInputPlane);
+    yblocks = yblocks < 1 ? 1 : yblocks;
+    dim3 blocks(nInputPlane,yblocks);
+    dim3 threads(32,8);
+
+    // run averagepool kernel
+    adaptiveaveragepool <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
+      input_data, output_data,
+      nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
+      istride_h, istride_w, istride_d);
+    THCudaCheck(cudaGetLastError());
+
+  } else {
+    input = THCTensor_(newContiguous)(state, input);
+    long nInputCols = input->size[3];
+    long nInputRows = input->size[2];
+    long nInputPlane = input->size[1];
+    long nbatch = input->size[0];
+
+    long istride_d = input->stride[1];
+    long istride_h = input->stride[2];
+    long istride_w = input->stride[3];
+
+    input_data = THCTensor_(data)(state, input);
+
+    THCTensor_(resize4d)(state, output, nbatch, nInputPlane, nOutputRows, nOutputCols);
+
+    output_data = THCTensor_(data)(state, output);
+
+    // cuda blocks & threads:
+    int yblocks = (int)(16L / nInputPlane);
+    yblocks = yblocks < 1 ? 1 : yblocks;
+    dim3 blocks(nInputPlane*nbatch,yblocks);
+    dim3 threads(32,8);
+
+    // run averagepool kernel
+    adaptiveaveragepool <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
+      input_data, output_data,
+      nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
+      istride_h, istride_w, istride_d);
+    THCudaCheck(cudaGetLastError());
+    // clean
+    THCTensor_(free)(state, input);
+  }
+}
+
+void THNN_(SpatialAdaptiveAveragePooling_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput)
+{
+  bool atomic = true; // suboptimal, but without atomic it doesn't pass the tests
+
+  THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput);
+
+  real *gradInput_data;
+  real *gradOutput_data;
+
+  gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+
+  if (input->nDimension == 3) {
+    long nInputCols = input->size[2];
+    long nInputRows = input->size[1];
+    long nInputPlane = input->size[0];
+    long nOutputCols = gradOutput->size[2];
+    long nOutputRows = gradOutput->size[1];
+
+    //bool atomic = (nInputCols%nOutputCols != 0) || (nInputRows%nOutputRows != 0);
+
+    THCTensor_(resizeAs)(state, gradInput, input);
+    THCTensor_(zero)(state, gradInput);
+
+    gradOutput_data = THCTensor_(data)(state, gradOutput);
+    gradInput_data = THCTensor_(data)(state, gradInput);
+
+    // cuda blocks & threads:
+    int yblocks = (int)(16L / nInputPlane);
+    yblocks = yblocks < 1 ? 1 : yblocks;
+    dim3 blocks(nInputPlane,yblocks);
+    dim3 threads(32,8);
+
+    if(atomic)
+    {
+      // run updateGradInput kernel, accumulate gradients atomically
+      atomicadaptiveaveragegradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
+        gradInput_data, gradOutput_data,
+        nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols);
+    }
+    else
+    {
+      // run updateGradInput kernel
+      adaptiveaveragegradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
+        gradInput_data, gradOutput_data,
+        nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols);
+    }
+    THCudaCheck(cudaGetLastError());
+  } else {
+    long nInputCols = input->size[3];
+    long nInputRows = input->size[2];
+    long nInputPlane = input->size[1];
+    long nbatch = input->size[0];
+    long nOutputCols = gradOutput->size[3];
+    long nOutputRows = gradOutput->size[2];
+
+    //bool atomic = (nInputCols%nOutputCols != 0) || (nInputRows%nOutputRows != 0);
+
+    THCTensor_(resizeAs)(state, gradInput, input);
+    THCTensor_(zero)(state, gradInput);
+
+    gradOutput_data = THCTensor_(data)(state, gradOutput);
+    gradInput_data = THCTensor_(data)(state, gradInput);
+
+    // cuda blocks & threads:
+    int yblocks = (int)(16L / nInputPlane);
+    yblocks = yblocks < 1 ? 1 : yblocks;
+    dim3 blocks(nInputPlane*nbatch,yblocks);
+    dim3 threads(32,8);
+
+    if(atomic)
+    {
+      // run updateGradInput kernel, accumulate gradients atomically
+      atomicadaptiveaveragegradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
+        gradInput_data, gradOutput_data,
+        nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols);
+    }
+    else
+    {
+      // run updateGradInput kernel
+      adaptiveaveragegradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
+        gradInput_data, gradOutput_data,
+        nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols);
+    }
+    THCudaCheck(cudaGetLastError());
+  }
+
+  // clean
+  THCTensor_(free)(state,gradOutput);
+
+}
+
+#endif
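For reference, the semantics these entry points implement per (batch, plane) slice: every output cell is the mean of its adaptive window, and in the backward pass each input cell inside that window receives gradOutput / (kH*kW). A compact CPU sketch of the forward computation, using the same floor/ceil window convention as the kernels (illustrative only, not part of the patch):

#include <math.h>
#include <stdio.h>

/* Adaptive average pooling of one contiguous (ih x iw) plane into (oh x ow). */
static void adaptive_avg_pool2d_plane(const float *in, float *out,
                                      int ih, int iw, int oh, int ow)
{
  for (int oy = 0; oy < oh; ++oy) {
    int y0 = (int)floorf((float)(oy * ih) / oh);       /* START_IND */
    int y1 = (int)ceilf((float)((oy + 1) * ih) / oh);  /* END_IND   */
    for (int ox = 0; ox < ow; ++ox) {
      int x0 = (int)floorf((float)(ox * iw) / ow);
      int x1 = (int)ceilf((float)((ox + 1) * iw) / ow);
      float sum = 0.f;
      for (int y = y0; y < y1; ++y)
        for (int x = x0; x < x1; ++x)
          sum += in[y * iw + x];
      out[oy * ow + ox] = sum / (float)((y1 - y0) * (x1 - x0));
    }
  }
}

int main(void)
{
  float in[4 * 4], out[2 * 2];
  for (int i = 0; i < 16; ++i) in[i] = (float)i;
  adaptive_avg_pool2d_plane(in, out, 4, 4, 2, 2);
  /* Expect the means of the four 2x2 quadrants: 2.5 4.5 10.5 12.5 */
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
  return 0;
}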
diff --git a/lib/THCUNN/generic/THCUNN.h b/lib/THCUNN/generic/THCUNN.h
index fabb8e93..a71209a2
--- a/lib/THCUNN/generic/THCUNN.h
+++ b/lib/THCUNN/generic/THCUNN.h
@@ -394,6 +394,19 @@ TH_API void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)(
                   THCTensor *gradInput,
                   THCIndexTensor *indices);
 
+TH_API void THNN_(SpatialAdaptiveAveragePooling_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  int nOutputCols,
+                  int nOutputRows);
+
+TH_API void THNN_(SpatialAdaptiveAveragePooling_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput);
+
 TH_API void THNN_(SpatialAveragePooling_updateOutput)(
                   THCState *state,
                   THCTensor *input,
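Once THCGenerateFloatTypes.h expands the generic sources, each prototype becomes one concrete symbol per CUDA type, following the usual THNN_Cuda* / THNN_CudaDouble* / THNN_CudaHalf* naming, which is how the nn.SpatialAdaptiveAveragePooling tests below reach the CUDA path. A hedged sketch of a direct C-side call for float tensors, assuming that standard expansion; the tensor handles and output sizes here are placeholders, not from the patch:

// state, input, output, gradOutput, gradInput are assumed to be valid
// THCState* / THCudaTensor* handles created elsewhere.
THNN_CudaSpatialAdaptiveAveragePooling_updateOutput(state, input, output,
                                                    /*nOutputCols=*/7,
                                                    /*nOutputRows=*/5);
THNN_CudaSpatialAdaptiveAveragePooling_updateGradInput(state, input, gradOutput,
                                                       gradInput);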
diff --git a/test.lua b/test.lua
index 8c892761..40b25e43
--- a/test.lua
+++ b/test.lua
@@ -2965,6 +2965,182 @@ function cunntest.SpatialAdaptiveMaxPooling_backward_batch()
    end
 end
 
+function cunntest.SpatialAdaptiveAveragePooling_forward()
+   local from = math.random(1,64)
+   local to = from
+   local outi = math.random(2,64)
+   local outj = math.random(2,64)
+   local ini = math.random(10,256)
+   local inj = math.random(10,256)
+
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(from,inj,ini):type(typename)
+      local ctype = t2cpu[typename]
+      input = makeNonContiguous(input:type(ctype))
+      local sconv = nn.SpatialAdaptiveAveragePooling(outi,outj):type(ctype)
+      local groundtruth = sconv:forward(input):type(ctype)
+
+      input = makeNonContiguous(input:type(typename))
+      local gconv = nn.SpatialAdaptiveAveragePooling(outi,outj):type(typename)
+      local rescuda = gconv:forward(input)
+
+      local error = rescuda:double() - groundtruth:double()
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+         string.format('error on state (forward) with %s', typename))
+   end
+end
+
+function cunntest.SpatialAdaptiveAveragePooling_forward_noncontig()
+   local from = math.random(1,64)
+   local to = from
+   local outi = math.random(2,64)
+   local outj = math.random(2,64)
+   local ini = math.random(10,256)
+   local inj = math.random(10,256)
+
+   for k, typename in ipairs(typenames) do
+      local input0 = torch.randn(from,ini,inj):type(typename)
+      local ctype = t2cpu[typename]
+      local input = makeNonContiguous(input0:type(ctype):transpose(2,3))
+      local sconv = nn.SpatialAdaptiveAveragePooling(outi,outj):type(ctype)
+      local groundtruth = sconv:forward(input)
+
+      input = makeNonContiguous(input0:type(typename):transpose(2,3))
+      local gconv = nn.SpatialAdaptiveAveragePooling(outi,outj):type(typename)
+      local rescuda = gconv:forward(input)
+
+      local error = rescuda:double() - groundtruth:double()
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+         string.format('error on state (forward) with %s', typename))
+   end
+end
+
+function cunntest.SpatialAdaptiveAveragePooling_forward_batch()
+   local bs = math.random(4,10)
+   local from = math.random(1,48)
+   local to = from
+   local outi = math.random(2,48)
+   local outj = math.random(2,48)
+   local ini = math.random(10,256)
+   local inj = math.random(10,256)
+
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(bs,from,inj,ini):type(typename)
+      local ctype = t2cpu[typename]
+      input = makeNonContiguous(input:type(ctype))
+      local sconv = nn.SpatialAdaptiveAveragePooling(outi,outj):type(ctype)
+      local groundtruth = sconv:forward(input)
+
+      input = makeNonContiguous(input:type(typename))
+      local gconv = nn.SpatialAdaptiveAveragePooling(outi,outj):type(typename)
+      local rescuda = gconv:forward(input)
+
+      local error = rescuda:double() - groundtruth:double()
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+         string.format('error on state (forward) with %s', typename))
+   end
+end
+
+function cunntest.SpatialAdaptiveAveragePooling_backward()
+   local from = math.random(1,64)
+   local to = from
+   local outi = math.random(2,64)
+   local outj = math.random(2,64)
+   local ini = math.random(10,256)
+   local inj = math.random(10,256)
+
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(from,inj,ini):type(typename)
+      local gradOutput = torch.randn(to,outj,outi):type(typename)
+      local ctype = t2cpu[typename]
+      input = makeNonContiguous(input:type(ctype))
+      gradOutput = makeNonContiguous(gradOutput:type(ctype))
+      local sconv = nn.SpatialAdaptiveAveragePooling(outi,outj):type(ctype)
+      sconv:forward(input)
+      sconv:zeroGradParameters()
+      local groundgrad = sconv:backward(input, gradOutput)
+
+      input = makeNonContiguous(input:type(typename))
+      gradOutput = makeNonContiguous(gradOutput:type(typename))
+      local gconv = nn.SpatialAdaptiveAveragePooling(outi,outj):type(typename)
+      gconv:forward(input)
+      gconv:zeroGradParameters()
+      local rescuda = gconv:backward(input, gradOutput)
+
+      local error = rescuda:double() - groundgrad:double()
+
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+         string.format('error on state (backward) with %s', typename))
+   end
+end
+
+function cunntest.SpatialAdaptiveAveragePooling_backward_noncontig()
+   local from = math.random(1,64)
+   local to = from
+   local outi = math.random(2,64)
+   local outj = math.random(2,64)
+   local ini = math.random(10,256)
+   local inj = math.random(10,256)
+
+   for k, typename in ipairs(typenames) do
+      local input0 = torch.randn(from,ini,inj):type(typename)
+      local gradOutput = torch.randn(to,outj,outi):type(typename)
+      local ctype = t2cpu[typename]
+      local input = makeNonContiguous(input0:type(ctype):transpose(2,3))
+      gradOutput = makeNonContiguous(gradOutput:type(ctype))
+      local sconv = nn.SpatialAdaptiveAveragePooling(outi,outj):type(ctype)
+      sconv:forward(input)
+      sconv:zeroGradParameters()
+      local groundgrad = sconv:backward(input, gradOutput)
+
+      input = makeNonContiguous(input0:type(typename):transpose(2,3))
+      gradOutput = makeNonContiguous(gradOutput:type(typename))
+      local gconv = nn.SpatialAdaptiveAveragePooling(outi,outj):type(typename)
+      gconv:forward(input)
+      gconv:zeroGradParameters()
+      local rescuda = gconv:backward(input, gradOutput)
+
+      local error = rescuda:double() - groundgrad:double()
+
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+         string.format('error on state (backward) with %s', typename))
+   end
+end
+
+function cunntest.SpatialAdaptiveAveragePooling_backward_batch()
+   local bs = math.random(4,10)
+   local from = math.random(1,64)
+   local to = from
+   local outi = math.random(2,64)
+   local outj = math.random(2,64)
+   local ini = math.random(10,256)
+   local inj = math.random(10,256)
+
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(bs,from,inj,ini):type(typename)
+      local gradOutput = torch.randn(bs,to,outj,outi):type(typename)
+      local ctype = t2cpu[typename]
+      input = makeNonContiguous(input:type(ctype))
+      gradOutput = makeNonContiguous(gradOutput:type(ctype))
+      local sconv = nn.SpatialAdaptiveAveragePooling(outi,outj):type(ctype)
+      sconv:forward(input)
+      sconv:zeroGradParameters()
+      local groundgrad = sconv:backward(input, gradOutput)
+
+      input = makeNonContiguous(input:type(typename))
+      gradOutput = makeNonContiguous(gradOutput:type(typename))
+      local gconv = nn.SpatialAdaptiveAveragePooling(outi,outj):type(typename)
+      gconv:forward(input)
+      gconv:zeroGradParameters()
+      local rescuda = gconv:backward(input, gradOutput)
+
+      local error = rescuda:double() - groundgrad:double()
+
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+         string.format('error on state (backward) with %s', typename))
+   end
+end
+
 function cunntest.SpatialLPPooling_forward()
    local from = math.random(1,64)
    local to = from