Merge pull request #418 from ruotianluo/adaptiveAverage
Add SpatialAdaptiveAveragePooling.
soumith authored Feb 21, 2017
2 parents e2084c1 + 8a553b7 commit 0d01aaa
Showing 4 changed files with 562 additions and 0 deletions.
200 changes: 200 additions & 0 deletions lib/THCUNN/SpatialAdaptiveAveragePooling.cu
@@ -0,0 +1,200 @@
#include "THCUNN.h"
#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"
#include "THCAtomics.cuh"

#define START_IND(a,b,c) (int)floor((float)(a * c) / b)
#define END_IND(a,b,c) (int)ceil((float)((a + 1) * c) / b)
// #define START_IND(a,b,c) a * c / b
// #define END_IND(a,b,c) (a + 1) * c / b + ((a + 1) * c % b > 0)?1:0
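// Each output index o maps to the input window [START_IND, END_IND) =
// [floor(o*in/out), ceil((o+1)*in/out)). Example: in = 10, out = 3 gives the
// row windows [0,4), [3,7), [6,10); neighbouring windows overlap at rows 3
// and 6, so one input element can contribute to more than one output.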


#define CUDA_MAX_THREADS 1024 // this is safe, in reality 256 is our limit

/*
 * Description:
 *    this function adaptively average pools an input 4D tensor along dimensions 2 and 3
 *    4D input, 4D output
 */
template <typename T>
__global__ void adaptiveaveragepool(T *input, T *output,
                        int input_n, int input_h, int input_w,
                        int output_h, int output_w,
                        int strideh, int stridew,
                        int strided)
{
  // iterators
  int xx, yy;

  // compute offsets based on thread/block ID
  int o = blockIdx.x;
  int i = o;
  //int k = blockIdx.x % input_n;

  int xx_start = threadIdx.x;
  int xx_end = output_w;
  const int xx_step = blockDim.x;

  int yy_start = blockDim.y*blockIdx.y + threadIdx.y;
  int yy_end = output_h;
  const int yy_step = blockDim.y*gridDim.y;
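  // gridDim.x provides one block per input/output plane (times the batch size
  // in batch mode), so each block works on a single plane; its threads stride
  // over the output in steps of blockDim.x columns and blockDim.y*gridDim.y rows.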
  // select input/output plane
  output = output + o*output_w*output_h;
  input = input + i*strided;

  // For all output pixels...
  for(yy = yy_start; yy < yy_end; yy+=yy_step) {

    int y_start = START_IND(yy, output_h, input_h);
    int y_end   = END_IND(yy, output_h, input_h);
    int kH = y_end - y_start;

    for(xx = xx_start; xx < xx_end; xx+=xx_step) {

      int x_start = START_IND(xx, output_w, input_w);
      int x_end   = END_IND(xx, output_w, input_w);
      int kW = x_end - x_start;

      // Compute the average pooling
      T *ptr_input = input + y_start*strideh + x_start*stridew;
      T *ptr_output = output + yy*output_w + xx;
      T sum = ScalarConvert<int, T>::to(0);
      int kx, ky;
      for(ky = 0; ky < kH; ++ky) {
        for(kx = 0; kx < kW; ++kx) {
          T val = ptr_input[kx*stridew];
          sum += val;
        }
        ptr_input += strideh; // next input line
      }
      // Update output
      *ptr_output = sum / kH / kW;
    }
  }
}

/*
 * Description:
 *    this function computes the gradInput from gradOutput
 */
template <typename T>
__global__ void adaptiveaveragegradinput(
  T *gradInput, T *gradOutput,
  int input_n, int input_h, int input_w, int output_h, int output_w
)
{
  // iterators
  int x, y;

  // compute offsets based on thread/block ID
  int o = blockIdx.x;
  int i = o;

  int x_start = threadIdx.x;
  int x_end = input_w;
  int x_step = blockDim.x;

  int y_start = blockDim.y*blockIdx.y + threadIdx.y;
  int y_end = input_h;
  int y_step = blockDim.y*gridDim.y;

  // select input/output plane
  gradOutput = gradOutput + o*output_w*output_h;
  gradInput = gradInput + i*input_w*input_h;

  // compute gradInput
  for(y = y_start; y < y_end; y+=y_step) {

    int yy_start = START_IND(y, input_h, output_h);
    int yy_end   = END_IND(y, input_h, output_h);
    int kH = yy_end - yy_start;

    for(x = x_start; x < x_end; x+=x_step) {

      int xx_start = START_IND(x, input_w, output_w);
      int xx_end   = END_IND(x, input_w, output_w);
      int kW = xx_end - xx_start;

      // Compute the gradients
      T *ptr_gradInput = gradInput + y*input_w + x;
      T *ptr_gradOutput = gradOutput + yy_start*output_w + xx_start;

      int kx, ky;
      for(ky = 0; ky < kH; ++ky) {
        int yy = yy_start + ky;
        // height of the forward-pass window for output row yy
        int kkH = END_IND(yy, output_h, input_h) - START_IND(yy, output_h, input_h);
        for(kx = 0; kx < kW; ++kx) {
          int xx = xx_start + kx;
          // width of the forward-pass window for output column xx
          int kkW = END_IND(xx, output_w, input_w) - START_IND(xx, output_w, input_w);
          T z = ptr_gradOutput[kx + ky*output_w] / kkW / kkH;
          *ptr_gradInput += z;
        }
      }
    }
  }
}

/*
 * Description:
 *    this function computes the gradInput from gradOutput
 *    (uses atomic add)
 */
template <typename T>
__global__ void atomicadaptiveaveragegradinput(
  T *gradInput, T *gradOutput,
  int input_n, int input_h, int input_w, int output_h, int output_w
)
{
  // iterators
  int xx, yy;

  // compute offsets based on thread/block ID
  int o = blockIdx.x;
  int i = o;

  int xx_start = threadIdx.x;
  int xx_end = output_w;
  int xx_step = blockDim.x;

  int yy_start = blockDim.y*blockIdx.y + threadIdx.y;
  int yy_end = output_h;
  int yy_step = blockDim.y*gridDim.y;

  // select input/output plane
  gradOutput = gradOutput + o*output_w*output_h;
  gradInput = gradInput + i*input_w*input_h;

  // compute gradInput
  for(yy = yy_start; yy < yy_end; yy+=yy_step) {

    int y_start = START_IND(yy, output_h, input_h);
    int y_end   = END_IND(yy, output_h, input_h);
    int kH = y_end - y_start;

    for(xx = xx_start; xx < xx_end; xx+=xx_step) {

      int x_start = START_IND(xx, output_w, input_w);
      int x_end   = END_IND(xx, output_w, input_w);
      int kW = x_end - x_start;

      // Compute the gradients
      T *ptr_gradInput = gradInput + y_start*input_w + x_start;
      T *ptr_gradOutput = gradOutput + yy*output_w + xx;
      T z = *ptr_gradOutput / kW / kH;
      int kx, ky;
      for(ky = 0; ky < kH; ++ky) {
        for(kx = 0; kx < kW; ++kx) {
          // atomic add since different threads could update same variable
          atomicAdd(&(ptr_gradInput[kx + ky*input_w]), z);
        }
      }
    }
  }
}

#include "generic/SpatialAdaptiveAveragePooling.cu"
#include "THCGenerateFloatTypes.h"

#undef CUDA_MAX_THREADS
#undef START_IND
#undef END_IND
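For reference, the window arithmetic used by the kernels above can be exercised on the CPU. The sketch below is not part of this commit; it is a minimal single-plane reference of the forward pass under the same floor/ceil window bounds as START_IND/END_IND, with a hypothetical name (adaptive_avg_pool_plane) and plain float input.

#include <math.h>

/* CPU reference: adaptive average pooling over one input_h x input_w plane. */
static void adaptive_avg_pool_plane(const float *input, float *output,
                                    int input_h, int input_w,
                                    int output_h, int output_w)
{
  for (int oy = 0; oy < output_h; ++oy) {
    int y0 = (int)floorf((float)(oy * input_h) / output_h);      /* START_IND */
    int y1 = (int)ceilf((float)((oy + 1) * input_h) / output_h); /* END_IND   */
    for (int ox = 0; ox < output_w; ++ox) {
      int x0 = (int)floorf((float)(ox * input_w) / output_w);
      int x1 = (int)ceilf((float)((ox + 1) * input_w) / output_w);
      float sum = 0.f;
      for (int y = y0; y < y1; ++y)
        for (int x = x0; x < x1; ++x)
          sum += input[y * input_w + x];
      /* divide by the window area, exactly like sum / kH / kW above */
      output[oy * output_w + ox] = sum / (float)((y1 - y0) * (x1 - x0));
    }
  }
}

For a contiguous plane this matches the CUDA kernel with strideh = input_w, stridew = 1 and strided = input_h * input_w.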
173 changes: 173 additions & 0 deletions lib/THCUNN/generic/SpatialAdaptiveAveragePooling.cu
@@ -0,0 +1,173 @@
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/SpatialAdaptiveAveragePooling.cu"
#else

#include "../common.h"

void THNN_(SpatialAdaptiveAveragePooling_updateOutput)(
           THCState *state,
           THCTensor *input,
           THCTensor *output,
           int nOutputCols,
           int nOutputRows)
{
  THCUNN_assertSameGPU(state, 2, input, output);

  real *output_data;
  real *input_data;

  THCUNN_argCheck(state, input->nDimension == 3 || input->nDimension == 4, 2, input,
                  "3D or 4D (batch mode) tensor expected for input, but got: %s");

  if (input->nDimension == 3) {
    long nInputCols = input->size[2];
    long nInputRows = input->size[1];
    long nInputPlane = input->size[0];

    long istride_d = input->stride[0];
    long istride_h = input->stride[1];
    long istride_w = input->stride[2];

    input_data = THCTensor_(data)(state, input);

    THCTensor_(resize3d)(state, output, nInputPlane, nOutputRows, nOutputCols);

    output_data = THCTensor_(data)(state, output);

    // cuda blocks & threads:
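    // 16L / nInputPlane (clamped to at least 1) gives about 16 y-blocks in
    // total when there are few planes, and one y-block per plane otherwise.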
    int yblocks = (int)(16L / nInputPlane);
    yblocks = yblocks < 1 ? 1 : yblocks;
    dim3 blocks(nInputPlane, yblocks);
    dim3 threads(32, 8);

    // run averagepool kernel
    adaptiveaveragepool <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
      input_data, output_data,
      nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
      istride_h, istride_w, istride_d);
    THCudaCheck(cudaGetLastError());

  } else {
    input = THCTensor_(newContiguous)(state, input);
    long nInputCols = input->size[3];
    long nInputRows = input->size[2];
    long nInputPlane = input->size[1];
    long nbatch = input->size[0];

    long istride_d = input->stride[1];
    long istride_h = input->stride[2];
    long istride_w = input->stride[3];

    input_data = THCTensor_(data)(state, input);

    THCTensor_(resize4d)(state, output, nbatch, nInputPlane, nOutputRows, nOutputCols);

    output_data = THCTensor_(data)(state, output);

    // cuda blocks & threads:
    int yblocks = (int)(16L / nInputPlane);
    yblocks = yblocks < 1 ? 1 : yblocks;
    dim3 blocks(nInputPlane*nbatch, yblocks);
    dim3 threads(32, 8);

    // run averagepool kernel
    adaptiveaveragepool <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
      input_data, output_data,
      nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
      istride_h, istride_w, istride_d);
    THCudaCheck(cudaGetLastError());
    // clean
    THCTensor_(free)(state, input);
  }
}

void THNN_(SpatialAdaptiveAveragePooling_updateGradInput)(
           THCState *state,
           THCTensor *input,
           THCTensor *gradOutput,
           THCTensor *gradInput)
{
  bool atomic = true; // suboptimal, but without atomic it doesn't pass the tests

  THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput);

  real *gradInput_data;
  real *gradOutput_data;

  gradOutput = THCTensor_(newContiguous)(state, gradOutput);

  if (input->nDimension == 3) {
    long nInputCols = input->size[2];
    long nInputRows = input->size[1];
    long nInputPlane = input->size[0];
    long nOutputCols = gradOutput->size[2];
    long nOutputRows = gradOutput->size[1];

    //bool atomic = (nInputCols%nOutputCols != 0) || (nInputRows%nOutputRows != 0);

    THCTensor_(resizeAs)(state, gradInput, input);
    THCTensor_(zero)(state, gradInput);

    gradOutput_data = THCTensor_(data)(state, gradOutput);
    gradInput_data = THCTensor_(data)(state, gradInput);

    // cuda blocks & threads:
    int yblocks = (int)(16L / nInputPlane);
    yblocks = yblocks < 1 ? 1 : yblocks;
    dim3 blocks(nInputPlane, yblocks);
    dim3 threads(32, 8);

    if (atomic)
    {
      // run updateGradInput kernel, accumulate gradients atomically
      atomicadaptiveaveragegradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
        gradInput_data, gradOutput_data,
        nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols);
    }
    else
    {
      // run updateGradInput kernel
      adaptiveaveragegradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
        gradInput_data, gradOutput_data,
        nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols);
    }
    THCudaCheck(cudaGetLastError());
  } else {
    long nInputCols = input->size[3];
    long nInputRows = input->size[2];
    long nInputPlane = input->size[1];
    long nbatch = input->size[0];
    long nOutputCols = gradOutput->size[3];
    long nOutputRows = gradOutput->size[2];

    //bool atomic = (nInputCols%nOutputCols != 0) || (nInputRows%nOutputRows != 0);

    THCTensor_(resizeAs)(state, gradInput, input);
    THCTensor_(zero)(state, gradInput);

    gradOutput_data = THCTensor_(data)(state, gradOutput);
    gradInput_data = THCTensor_(data)(state, gradInput);

    // cuda blocks & threads:
    int yblocks = (int)(16L / nInputPlane);
    yblocks = yblocks < 1 ? 1 : yblocks;
    dim3 blocks(nInputPlane*nbatch, yblocks);
    dim3 threads(32, 8);

    if (atomic)
    {
      // run updateGradInput kernel, accumulate gradients atomically
      atomicadaptiveaveragegradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
        gradInput_data, gradOutput_data,
        nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols);
    }
    else
    {
      // run updateGradInput kernel
      adaptiveaveragegradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
        gradInput_data, gradOutput_data,
        nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols);
    }
    THCudaCheck(cudaGetLastError());
  }

  // clean
  THCTensor_(free)(state, gradOutput);

}

#endif
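The backward path can be checked the same way. The sketch below is not part of this commit either; it is a serial, single-plane reference that mirrors what atomicadaptiveaveragegradinput does: each gradOutput cell is divided by the area of its forward-pass window and added back to every input position in that window (the name adaptive_avg_pool_plane_backward is hypothetical). Because neighbouring windows can overlap, different output cells, and hence different CUDA threads, may add into the same gradInput element, which is why the launched kernel accumulates with atomicAdd.

#include <math.h>

/* CPU reference: backward of adaptive average pooling over one plane. */
static void adaptive_avg_pool_plane_backward(float *gradInput, const float *gradOutput,
                                             int input_h, int input_w,
                                             int output_h, int output_w)
{
  for (int i = 0; i < input_h * input_w; ++i)
    gradInput[i] = 0.f;                       /* THCTensor_(zero) in the wrapper */

  for (int oy = 0; oy < output_h; ++oy) {
    int y0 = (int)floorf((float)(oy * input_h) / output_h);
    int y1 = (int)ceilf((float)((oy + 1) * input_h) / output_h);
    for (int ox = 0; ox < output_w; ++ox) {
      int x0 = (int)floorf((float)(ox * input_w) / output_w);
      int x1 = (int)ceilf((float)((ox + 1) * input_w) / output_w);
      /* share of this output cell's gradient per input element in its window */
      float z = gradOutput[oy * output_w + ox] / (float)((y1 - y0) * (x1 - x0));
      for (int y = y0; y < y1; ++y)
        for (int x = x0; x < x1; ++x)
          gradInput[y * input_w + x] += z;    /* serial "+=" here; atomicAdd on the GPU */
    }
  }
}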
13 changes: 13 additions & 0 deletions lib/THCUNN/generic/THCUNN.h
@@ -394,6 +394,19 @@ TH_API void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)(
                  THCTensor *gradInput,
                  THCIndexTensor *indices);

TH_API void THNN_(SpatialAdaptiveAveragePooling_updateOutput)(
                  THCState *state,
                  THCTensor *input,
                  THCTensor *output,
                  int nOutputCols,
                  int nOutputRows);

TH_API void THNN_(SpatialAdaptiveAveragePooling_updateGradInput)(
                  THCState *state,
                  THCTensor *input,
                  THCTensor *gradOutput,
                  THCTensor *gradInput);

TH_API void THNN_(SpatialAveragePooling_updateOutput)(
                  THCState *state,
                  THCTensor *input,
