diff --git a/lib/THCUNN/SpatialAdaptiveAveragePooling.cu b/lib/THCUNN/SpatialAdaptiveAveragePooling.cu
new file mode 100644
index 00000000..b1e5e5c6
--- /dev/null
+++ b/lib/THCUNN/SpatialAdaptiveAveragePooling.cu
@@ -0,0 +1,200 @@
+#include "THCUNN.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+#include "THCAtomics.cuh"
+
+#define START_IND(a,b,c) (int)floor((float)(a * c) / b)
+#define END_IND(a,b,c) (int)ceil((float)((a + 1) * c) / b)
+// #define START_IND(a,b,c) a * c / b
+// #define END_IND(a,b,c) (a + 1) * c / b + ((a + 1) * c % b > 0)?1:0
+
+
+#define CUDA_MAX_THREADS 1024   // this is safe, in reality 256 is our limit
+
+/*
+ * Description:
+ *    this function adaptively average pools an input 4D tensor along dimensions 2 and 3
+ *    4D input, 4D output
+ */
+ template <typename T>
+__global__ void adaptiveaveragepool(T *input, T *output,
+                        int input_n, int input_h, int input_w,
+                        int output_h, int output_w,
+                        int strideh, int stridew,
+                        int strided)
+{
+  // iterators
+  int xx, yy;
+
+  // compute offsets based on thread/block ID
+  int o = blockIdx.x;
+  int i = o;
+  //int k = blockIdx.x % input_n;
+
+  int xx_start = threadIdx.x;
+  int xx_end = output_w;
+  const int xx_step = blockDim.x;
+
+  int yy_start = blockDim.y*blockIdx.y + threadIdx.y;
+  int yy_end = output_h;
+  const int yy_step = blockDim.y*gridDim.y;
+  // select input/output plane
+  output = output + o*output_w*output_h;
+  input = input + i*strided;
+
+  // For all output pixels...
+  for(yy = yy_start; yy < yy_end; yy+=yy_step) {
+
+    int y_start = START_IND(yy, output_h, input_h);
+    int y_end   = END_IND(yy, output_h, input_h);
+    int kH = y_end-y_start;
+
+    for(xx = xx_start; xx < xx_end; xx+=xx_step) {
+
+      int x_start = START_IND(xx, output_w, input_w);
+      int x_end   = END_IND(xx, output_w, input_w);
+      int kW = x_end-x_start;
+
+      // Compute the average pooling
+      T *ptr_input = input + y_start*strideh + x_start*stridew;
+      T *ptr_output = output + yy*output_w + xx;
+      T sum = ScalarConvert<int, T>::to(0);
+      int kx, ky;
+      for(ky = 0; ky < kH; ++ky) {
+        for(kx = 0; kx < kW; ++kx) {
+          T val = ptr_input[kx*stridew];
+          sum += val;
+        }
+        ptr_input += strideh; // next input line
+      }
+      // Update output
+      *ptr_output = sum / kH / kW;
+    }
+  }
+}
+
+/*
+ * Description:
+ *    this function computes the gradInput from gradOutput
+ */
+ template <typename T>
+__global__ void adaptiveaveragegradinput(
+  T *gradInput, T *gradOutput,
+  int input_n, int input_h, int input_w, int output_h, int output_w
+)
+{
+  // iterators
+  int x, y;
+
+  // compute offsets based on thread/block ID
+  int o = blockIdx.x;
+  int i = o;
+
+  int x_start = threadIdx.x;
+  int x_end = input_w;
+  int x_step = blockDim.x;
+
+  int y_start = blockDim.y*blockIdx.y + threadIdx.y;
+  int y_end = input_h;
+  int y_step = blockDim.y*gridDim.y;
+
+  // select input/output plane
+  gradOutput = gradOutput + o*output_w*output_h;
+  gradInput = gradInput + i*input_w*input_h;
+
+  // compute gradInput
+  for(y = y_start; y < y_end; y+=y_step) {
+
+    int yy_start = START_IND(y, input_h, output_h);
+    int yy_end   = END_IND(y, input_h, output_h);
+    int kH = yy_end-yy_start;
+
+    for(x = x_start; x < x_end; x+=x_step) {
+
+      int xx_start = START_IND(x, input_w, output_w);
+      int xx_end   = END_IND(x, input_w, output_w);
+      int kW = xx_end-xx_start;
+
+      // Compute the gradients
+      T *ptr_gradInput = gradInput + y*input_w + x;
+      T *ptr_gradOutput = gradOutput + yy_start*output_w + xx_start;
+
+      int kx, ky;
+      for(ky = 0; ky < kH; ++ky) {
+        int yy = yy_start + ky;
+        int kkH = START_IND(yy, output_h, input_h) - END_IND(yy, output_h, input_h);
+        for(kx = 0; kx < kW; ++kx) {
+          int xx = xx_start + kx;
+          int kkW = START_IND(xx, output_w, input_w) - END_IND(xx, output_w, input_w);
+          T z = ptr_gradOutput[kx + ky*output_w] / kkW / kkH;
+          *ptr_gradInput += z;
+        }
+      }
+    }
+  }
+}
+
+/*
+ * Description:
+ *    this function computes the gradInput from gradOutput
+ *    (uses atomic add)
+ */
+ template <typename T>
+__global__ void atomicadaptiveaveragegradinput(
+  T *gradInput, T *gradOutput,
+  int input_n, int input_h, int input_w, int output_h, int output_w
+)
+{
+  // iterators
+  int xx, yy;
+
+  // compute offsets based on thread/block ID
+  int o = blockIdx.x;
+  int i = o;
+
+  int xx_start = threadIdx.x;
+  int xx_end = output_w;
+  int xx_step = blockDim.x;
+
+  int yy_start = blockDim.y*blockIdx.y + threadIdx.y;
+  int yy_end = output_h;
+  int yy_step = blockDim.y*gridDim.y;
+
+  // select input/output plane
+  gradOutput = gradOutput + o*output_w*output_h;
+  gradInput = gradInput + i*input_w*input_h;
+
+  // compute gradInput
+  for(yy = yy_start; yy < yy_end; yy+=yy_step) {
+
+    int y_start = START_IND(yy, output_h, input_h);
+    int y_end   = END_IND(yy, output_h, input_h);
+    int kH = y_end-y_start;
+
+    for(xx = xx_start; xx < xx_end; xx+=xx_step) {
+
+      int x_start = START_IND(xx, output_w, input_w);
+      int x_end   = END_IND(xx, output_w, input_w);
+      int kW = x_end-x_start;
+
+      // Compute the gradients
+      T *ptr_gradInput = gradInput + y_start*input_w + x_start;
+      T *ptr_gradOutput = gradOutput + yy*output_w + xx;
+      T z = *ptr_gradOutput / kW / kH;
+      int kx, ky;
+      for(ky = 0; ky < kH; ++ky) {
+        for(kx = 0; kx < kW; ++kx) {
+          // atomic add since different threads could update same variable
+          atomicAdd(&(ptr_gradInput[kx + ky*input_w]), z);
+        }
+      }
+    }
+  }
+}
+
+#include "generic/SpatialAdaptiveAveragePooling.cu"
+#include "THCGenerateFloatTypes.h"
+
+#undef CUDA_MAX_THREADS
+#undef START_IND
+#undef END_IND
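Note: the window arithmetic above is easiest to see with a concrete size. Pooling a width of 10 down to 3 outputs gives the overlapping windows [0,4), [3,7) and [6,10); input columns 3 and 6 each contribute to two outputs, which is why the scatter-style backward kernel accumulates with atomicAdd. A minimal host-side sketch of the same macros, illustrative only and not part of the patch:

#include <math.h>
#include <stdio.h>

#define START_IND(a,b,c) (int)floor((float)((a) * (c)) / (b))
#define END_IND(a,b,c)   (int)ceil((float)(((a) + 1) * (c)) / (b))

int main(void)
{
  // Map 3 output columns back onto 10 input columns, as the kernels do.
  int input_w = 10, output_w = 3;
  for (int xx = 0; xx < output_w; ++xx)
    printf("output %d <- input columns [%d, %d)\n",
           xx, START_IND(xx, output_w, input_w), END_IND(xx, output_w, input_w));
  return 0;
}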
diff --git a/lib/THCUNN/generic/SpatialAdaptiveAveragePooling.cu b/lib/THCUNN/generic/SpatialAdaptiveAveragePooling.cu
new file mode 100644
index 00000000..e444d1f8
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialAdaptiveAveragePooling.cu
@@ -0,0 +1,173 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialAdaptiveAveragePooling.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(SpatialAdaptiveAveragePooling_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           int nOutputCols,
+           int nOutputRows)
+{
+  THCUNN_assertSameGPU(state, 2, input, output);
+
+  real *output_data;
+  real *input_data;
+
+  THCUNN_argCheck(state, input->nDimension == 3 || input->nDimension == 4, 2, input,
+                  "3D or 4D (batch mode) tensor expected for input, but got: %s");
+
+  if (input->nDimension == 3) {
+    long nInputCols = input->size[2];
+    long nInputRows = input->size[1];
+    long nInputPlane = input->size[0];
+
+    long istride_d = input->stride[0];
+    long istride_h = input->stride[1];
+    long istride_w = input->stride[2];
+
+    input_data = THCTensor_(data)(state, input);
+
+    THCTensor_(resize3d)(state, output, nInputPlane, nOutputRows, nOutputCols);
+
+    output_data = THCTensor_(data)(state, output);
+
+    // cuda blocks & threads:
+    int yblocks = (int)(16L / nInputPlane);
+    yblocks = yblocks < 1 ? 1 : yblocks;
+    dim3 blocks(nInputPlane,yblocks);
+    dim3 threads(32,8);
+
+    // run averagepool kernel
+    adaptiveaveragepool <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
+      input_data, output_data,
+      nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
+      istride_h, istride_w, istride_d);
+    THCudaCheck(cudaGetLastError());
+
+  } else {
+    input = THCTensor_(newContiguous)(state, input);
+    long nInputCols = input->size[3];
+    long nInputRows = input->size[2];
+    long nInputPlane = input->size[1];
+    long nbatch = input->size[0];
+
+    long istride_d = input->stride[1];
+    long istride_h = input->stride[2];
+    long istride_w = input->stride[3];
+
+    input_data = THCTensor_(data)(state, input);
+
+    THCTensor_(resize4d)(state, output, nbatch, nInputPlane, nOutputRows, nOutputCols);
+
+    output_data = THCTensor_(data)(state, output);
+
+    // cuda blocks & threads:
+    int yblocks = (int)(16L / nInputPlane);
+    yblocks = yblocks < 1 ? 1 : yblocks;
+    dim3 blocks(nInputPlane*nbatch,yblocks);
+    dim3 threads(32,8);
+
+    // run averagepool kernel
+    adaptiveaveragepool <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
+      input_data, output_data,
+      nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
+      istride_h, istride_w, istride_d);
+    THCudaCheck(cudaGetLastError());
+    // clean
+    THCTensor_(free)(state, input);
+  }
+}
+
+void THNN_(SpatialAdaptiveAveragePooling_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput)
+{
+  bool atomic = true; // suboptimal, but without atomic it doesn't pass the tests
+
+  THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput);
+
+  real *gradInput_data;
+  real *gradOutput_data;
+
+  gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+
+  if (input->nDimension == 3) {
+    long nInputCols = input->size[2];
+    long nInputRows = input->size[1];
+    long nInputPlane = input->size[0];
+    long nOutputCols = gradOutput->size[2];
+    long nOutputRows = gradOutput->size[1];
+
+    //bool atomic = (nInputCols%nOutputCols != 0) || (nInputRows%nOutputRows != 0);
+
+    THCTensor_(resizeAs)(state, gradInput, input);
+    THCTensor_(zero)(state, gradInput);
+
+    gradOutput_data = THCTensor_(data)(state, gradOutput);
+    gradInput_data = THCTensor_(data)(state, gradInput);
+
+    // cuda blocks & threads:
+    int yblocks = (int)(16L / nInputPlane);
+    yblocks = yblocks < 1 ? 1 : yblocks;
+    dim3 blocks(nInputPlane,yblocks);
+    dim3 threads(32,8);
+
+    if(atomic)
+    {
+      // run updateGradInput kernel, accumulate gradients atomically
+      atomicadaptiveaveragegradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
+        gradInput_data, gradOutput_data,
+        nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols);
+    }
+    else
+    {
+      // run updateGradInput kernel
+      adaptiveaveragegradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
+        gradInput_data, gradOutput_data,
+        nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols);
+    }
+    THCudaCheck(cudaGetLastError());
+  } else {
+    long nInputCols = input->size[3];
+    long nInputRows = input->size[2];
+    long nInputPlane = input->size[1];
+    long nbatch = input->size[0];
+    long nOutputCols = gradOutput->size[3];
+    long nOutputRows = gradOutput->size[2];
+
+    //bool atomic = (nInputCols%nOutputCols != 0) || (nInputRows%nOutputRows != 0);
+
+    THCTensor_(resizeAs)(state, gradInput, input);
+    THCTensor_(zero)(state, gradInput);
+
+    gradOutput_data = THCTensor_(data)(state, gradOutput);
+    gradInput_data = THCTensor_(data)(state, gradInput);
+
+    // cuda blocks & threads:
+    int yblocks = (int)(16L / nInputPlane);
+    yblocks = yblocks < 1 ? 1 : yblocks;
+    dim3 blocks(nInputPlane*nbatch,yblocks);
+    dim3 threads(32,8);
+
+    if(atomic)
+    {
+      // run updateGradInput kernel, accumulate gradients atomically
+      atomicadaptiveaveragegradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
+        gradInput_data, gradOutput_data,
+        nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols);
+    }
+    else
+    {
+      // run updateGradInput kernel
+      adaptiveaveragegradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
+        gradInput_data, gradOutput_data,
+        nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols);
+    }
+    THCudaCheck(cudaGetLastError());
+  }
+
+  // clean
+  THCTensor_(free)(state,gradOutput);
+
+}
+
+#endif
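For reference, the semantics these entry points implement per (batch, plane) slice: every output cell is the mean of its adaptive window, and in the backward pass each input cell inside that window receives gradOutput / (kH*kW). A compact CPU sketch of the forward computation, using the same floor/ceil window convention as the kernels (illustrative only, not part of the patch):

#include <math.h>
#include <stdio.h>

/* Adaptive average pooling of one contiguous (ih x iw) plane into (oh x ow). */
static void adaptive_avg_pool2d_plane(const float *in, float *out,
                                      int ih, int iw, int oh, int ow)
{
  for (int oy = 0; oy < oh; ++oy) {
    int y0 = (int)floorf((float)(oy * ih) / oh);       /* START_IND */
    int y1 = (int)ceilf((float)((oy + 1) * ih) / oh);  /* END_IND   */
    for (int ox = 0; ox < ow; ++ox) {
      int x0 = (int)floorf((float)(ox * iw) / ow);
      int x1 = (int)ceilf((float)((ox + 1) * iw) / ow);
      float sum = 0.f;
      for (int y = y0; y < y1; ++y)
        for (int x = x0; x < x1; ++x)
          sum += in[y * iw + x];
      out[oy * ow + ox] = sum / (float)((y1 - y0) * (x1 - x0));
    }
  }
}

int main(void)
{
  float in[4 * 4], out[2 * 2];
  for (int i = 0; i < 16; ++i) in[i] = (float)i;
  adaptive_avg_pool2d_plane(in, out, 4, 4, 2, 2);
  /* Expect the means of the four 2x2 quadrants: 2.5 4.5 10.5 12.5 */
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
  return 0;
}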
diff --git a/lib/THCUNN/generic/THCUNN.h b/lib/THCUNN/generic/THCUNN.h
index fabb8e93..a71209a2
--- a/lib/THCUNN/generic/THCUNN.h
+++ b/lib/THCUNN/generic/THCUNN.h
@@ -394,6 +394,19 @@ TH_API void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)(
                   THCTensor *gradInput,
                   THCIndexTensor *indices);
 
+TH_API void THNN_(SpatialAdaptiveAveragePooling_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  int nOutputCols,
+                  int nOutputRows);
+
+TH_API void THNN_(SpatialAdaptiveAveragePooling_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput);
+
 TH_API void THNN_(SpatialAveragePooling_updateOutput)(
                   THCState *state,
                   THCTensor *input,
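Once THCGenerateFloatTypes.h expands the generic sources, each prototype becomes one concrete symbol per CUDA type, following the usual THNN_Cuda* / THNN_CudaDouble* / THNN_CudaHalf* naming, which is how the nn.SpatialAdaptiveAveragePooling tests below reach the CUDA path. A hedged sketch of a direct C-side call for float tensors, assuming that standard expansion; the tensor handles and output sizes here are placeholders, not from the patch:

// state, input, output, gradOutput, gradInput are assumed to be valid
// THCState* / THCudaTensor* handles created elsewhere.
THNN_CudaSpatialAdaptiveAveragePooling_updateOutput(state, input, output,
                                                    /*nOutputCols=*/7,
                                                    /*nOutputRows=*/5);
THNN_CudaSpatialAdaptiveAveragePooling_updateGradInput(state, input, gradOutput,
                                                       gradInput);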
diff --git a/test.lua b/test.lua
index 8c892761..40b25e43
--- a/test.lua
+++ b/test.lua
@@ -2965,6 +2965,182 @@ function cunntest.SpatialAdaptiveMaxPooling_backward_batch()
    end
 end
 
+function cunntest.SpatialAdaptiveAveragePooling_forward()
+   local from = math.random(1,64)
+   local to = from
+   local outi = math.random(2,64)
+   local outj = math.random(2,64)
+   local ini = math.random(10,256)
+   local inj = math.random(10,256)
+
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(from,inj,ini):type(typename)
+      local ctype = t2cpu[typename]
+      input = makeNonContiguous(input:type(ctype))
+      local sconv = nn.SpatialAdaptiveAveragePooling(outi,outj):type(ctype)
+      local groundtruth = sconv:forward(input):type(ctype)
+
+      input = makeNonContiguous(input:type(typename))
+      local gconv = nn.SpatialAdaptiveAveragePooling(outi,outj):type(typename)
+      local rescuda = gconv:forward(input)
+
+      local error = rescuda:double() - groundtruth:double()
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+         string.format('error on state (forward) with %s', typename))
+   end
+end
+
+function cunntest.SpatialAdaptiveAveragePooling_forward_noncontig()
+   local from = math.random(1,64)
+   local to = from
+   local outi = math.random(2,64)
+   local outj = math.random(2,64)
+   local ini = math.random(10,256)
+   local inj = math.random(10,256)
+
+   for k, typename in ipairs(typenames) do
+      local input0 = torch.randn(from,ini,inj):type(typename)
+      local ctype = t2cpu[typename]
+      local input = makeNonContiguous(input0:type(ctype):transpose(2,3))
+      local sconv = nn.SpatialAdaptiveAveragePooling(outi,outj):type(ctype)
+      local groundtruth = sconv:forward(input)
+
+      input = makeNonContiguous(input0:type(typename):transpose(2,3))
+      local gconv = nn.SpatialAdaptiveAveragePooling(outi,outj):type(typename)
+      local rescuda = gconv:forward(input)
+
+      local error = rescuda:double() - groundtruth:double()
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+         string.format('error on state (forward) with %s', typename))
+   end
+end
+
+function cunntest.SpatialAdaptiveAveragePooling_forward_batch()
+   local bs = math.random(4,10)
+   local from = math.random(1,48)
+   local to = from
+   local outi = math.random(2,48)
+   local outj = math.random(2,48)
+   local ini = math.random(10,256)
+   local inj = math.random(10,256)
+
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(bs,from,inj,ini):type(typename)
+      local ctype = t2cpu[typename]
+      input = makeNonContiguous(input:type(ctype))
+      local sconv = nn.SpatialAdaptiveAveragePooling(outi,outj):type(ctype)
+      local groundtruth = sconv:forward(input)
+
+      input = makeNonContiguous(input:type(typename))
+      local gconv = nn.SpatialAdaptiveAveragePooling(outi,outj):type(typename)
+      local rescuda = gconv:forward(input)
+
+      local error = rescuda:double() - groundtruth:double()
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+         string.format('error on state (forward) with %s', typename))
+   end
+end
+
+function cunntest.SpatialAdaptiveAveragePooling_backward()
+   local from = math.random(1,64)
+   local to = from
+   local outi = math.random(2,64)
+   local outj = math.random(2,64)
+   local ini = math.random(10,256)
+   local inj = math.random(10,256)
+
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(from,inj,ini):type(typename)
+      local gradOutput = torch.randn(to,outj,outi):type(typename)
+      local ctype = t2cpu[typename]
+      input = makeNonContiguous(input:type(ctype))
+      gradOutput = makeNonContiguous(gradOutput:type(ctype))
+      local sconv = nn.SpatialAdaptiveAveragePooling(outi,outj):type(ctype)
+      sconv:forward(input)
+      sconv:zeroGradParameters()
+      local groundgrad = sconv:backward(input, gradOutput)
+
+      input = makeNonContiguous(input:type(typename))
+      gradOutput = makeNonContiguous(gradOutput:type(typename))
+      local gconv = nn.SpatialAdaptiveAveragePooling(outi,outj):type(typename)
+      gconv:forward(input)
+      gconv:zeroGradParameters()
+      local rescuda = gconv:backward(input, gradOutput)
+
+      local error = rescuda:double() - groundgrad:double()
+
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+         string.format('error on state (backward) with %s', typename))
+   end
+end
+
+function cunntest.SpatialAdaptiveAveragePooling_backward_noncontig()
+   local from = math.random(1,64)
+   local to = from
+   local outi = math.random(2,64)
+   local outj = math.random(2,64)
+   local ini = math.random(10,256)
+   local inj = math.random(10,256)
+
+   for k, typename in ipairs(typenames) do
+      local input0 = torch.randn(from,ini,inj):type(typename)
+      local gradOutput = torch.randn(to,outj,outi):type(typename)
+      local ctype = t2cpu[typename]
+      local input = makeNonContiguous(input0:type(ctype):transpose(2,3))
+      gradOutput = makeNonContiguous(gradOutput:type(ctype))
+      local sconv = nn.SpatialAdaptiveAveragePooling(outi,outj):type(ctype)
+      sconv:forward(input)
+      sconv:zeroGradParameters()
+      local groundgrad = sconv:backward(input, gradOutput)
+
+      input = makeNonContiguous(input0:type(typename):transpose(2,3))
+      gradOutput = makeNonContiguous(gradOutput:type(typename))
+      local gconv = nn.SpatialAdaptiveAveragePooling(outi,outj):type(typename)
+      gconv:forward(input)
+      gconv:zeroGradParameters()
+      local rescuda = gconv:backward(input, gradOutput)
+
+      local error = rescuda:double() - groundgrad:double()
+
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+         string.format('error on state (backward) with %s', typename))
+   end
+end
+
+function cunntest.SpatialAdaptiveAveragePooling_backward_batch()
+   local bs = math.random(4,10)
+   local from = math.random(1,64)
+   local to = from
+   local outi = math.random(2,64)
+   local outj = math.random(2,64)
+   local ini = math.random(10,256)
+   local inj = math.random(10,256)
+
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(bs,from,inj,ini):type(typename)
+      local gradOutput = torch.randn(bs,to,outj,outi):type(typename)
+      local ctype = t2cpu[typename]
+      input = makeNonContiguous(input:type(ctype))
+      gradOutput = makeNonContiguous(gradOutput:type(ctype))
+      local sconv = nn.SpatialAdaptiveAveragePooling(outi,outj):type(ctype)
+      sconv:forward(input)
+      sconv:zeroGradParameters()
+      local groundgrad = sconv:backward(input, gradOutput)
+
+      input = makeNonContiguous(input:type(typename))
+      gradOutput = makeNonContiguous(gradOutput:type(typename))
+      local gconv = nn.SpatialAdaptiveAveragePooling(outi,outj):type(typename)
+      gconv:forward(input)
+      gconv:zeroGradParameters()
+      local rescuda = gconv:backward(input, gradOutput)
+
+      local error = rescuda:double() - groundgrad:double()
+
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+         string.format('error on state (backward) with %s', typename))
+   end
+end
+
 function cunntest.SpatialLPPooling_forward()
    local from = math.random(1,64)
    local to = from