From 64496ed4d9cc264428689f1315c38d8b904af8a6 Mon Sep 17 00:00:00 2001
From: Egor Burkov
Date: Fri, 4 Aug 2017 22:14:48 +0300
Subject: [PATCH 1/2] Faster SpatialDepthWiseConvolution

---
 .../generic/SpatialDepthWiseConvolution.cu | 699 +++++++++---------
 lib/THCUNN/im2col.h                        | 116 ++-
 2 files changed, 438 insertions(+), 377 deletions(-)

diff --git a/lib/THCUNN/generic/SpatialDepthWiseConvolution.cu b/lib/THCUNN/generic/SpatialDepthWiseConvolution.cu
index 68077ed5..d324b00c 100644
--- a/lib/THCUNN/generic/SpatialDepthWiseConvolution.cu
+++ b/lib/THCUNN/generic/SpatialDepthWiseConvolution.cu
@@ -2,6 +2,62 @@
 #define THC_GENERIC_FILE "generic/SpatialDepthWiseConvolution.cu"
 #else
 
+#include "common.h"
+#include <vector>
+
+// Helper for `updateOutput`. Fills `output` with constant (bias) values
+__global__ void fillOutputWithBias(
+    real *output, const int batchSize, const int elementsPerPlane,
+    const real *bias, const int nInputPlane, const int nOutputPlane) {
+
+  // `output` is of size
+  // (batchSize) x (nInputPlane) x (nOutputPlane) x (outputHeight*outputWidth)
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (index < batchSize * nInputPlane * nOutputPlane * elementsPerPlane) {
+    real *outputPixel = &output[index];
+
+    index /= elementsPerPlane; index %= nInputPlane * nOutputPlane; // index of the output channel
+    int outPlaneIdx = index % nOutputPlane;
+    int inPlaneIdx  = index / nOutputPlane;
+
+    // bias is of size (nOutputPlane) x (nInputPlane)
+    *outputPixel = bias[outPlaneIdx*nInputPlane + inPlaneIdx];
+  }
+}
+
+// Transposes `tensor` pseudo-in-place using `buffer`.
+// `buffer` is enlarged if needed.
+// This function REQUIRES `buffer` to be non-NULL.
+void transposeWithBuffer(
+    THCState *state, THCTensor *tensor, THCStorage *buffer,
+    const int dim1, const int dim2) {
+
+  THAssert(buffer != NULL);
+
+  THCTensor *tensor_viewT = THCTensor_(newTranspose)(state, tensor, dim1, dim2);
+
+  // Size of `buffer` (== `columns`, for example) should be
+  // enough in general, but who knows, so let `setStorageNd` ensure
+  THCTensor *bufferTensorT = THCTensor_(new)(state);
+  THCTensor_(setStorageNd)(state, bufferTensorT, buffer, 0,
+      tensor_viewT->nDimension, tensor_viewT->size, NULL);
+
+  // This makes a contiguous tensor from `tensor_viewT`, i.e.
+  // does the actual transpose
+  THCTensor_(copy)(state, bufferTensorT, tensor_viewT);
+  // Copy the transposed data back
+  THCTensor_(copy)(state, tensor, bufferTensorT);
+
+  // Now reshape `tensor` to match the transposed size
+  std::vector<long> newSize(tensor->size, tensor->size + tensor->nDimension);
+  std::swap(newSize[dim1], newSize[dim2]);
+  THCTensor_(setStorageNd)(state, tensor, tensor->storage,
+      tensor->storageOffset, tensor->nDimension, newSize.data(), NULL);
+
+  THCTensor_(free)(state, tensor_viewT);
+  THCTensor_(free)(state, bufferTensorT);
+}
+
 static inline void THNN_(SpatialDepthWiseConvolution_shapeCheck)(
                          THCState *state,
                          THCTensor *input, THCTensor *gradOutput,
@@ -55,6 +111,58 @@ static inline void THNN_(SpatialDepthWiseConvolution_shapeCheck)(
   }
 }
 
+__global__ void updateOutputKernel(
+    real *output          , const real *input     ,
+    const real *weight    , const int batchSize   ,
+    const int inputHeight , const int inputWidth  ,
+    const int nInputPlane , const int nOutputPlane,
+    const int outputHeight, const int outputWidth ,
+    const int kH          , const int kW          ,
+    const int padH        , const int padW        ,
+    const int strideH     , const int strideW     ,
+    const int dilationH   , const int dilationW) {
+
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (index < batchSize * nInputPlane * nOutputPlane * outputHeight * outputWidth) {
+
+    const int outIndex = index;
+    // grid is (batchSize) x (nInputPlane) x (nOutputPlane) x (outputHeight) x (outputWidth)
+    const int wOut        = index % outputWidth ; index /= outputWidth ;
+    const int hOut        = index % outputHeight; index /= outputHeight;
+    const int outPlaneIdx = index % nOutputPlane; index /= nOutputPlane;
+    const int inPlaneIdx  = index % nInputPlane ; index /= nInputPlane ;
+    // `index` is now the index of the sample in batch
+    const int & sampleIdx = index;
+
+    const int hInStart = hOut * strideH - padH;
+    const int wInStart = wOut * strideW - padW;
+    input += ((sampleIdx
+        * nInputPlane + inPlaneIdx)
+        * inputHeight + hInStart)
+        * inputWidth + wInStart;
+
+    // weight is (nInputPlane) x (nOutputPlane) x (kH) x (kW)
+    weight += (inPlaneIdx * nOutputPlane + outPlaneIdx) * kH * kW;
+
+    accreal result = 0;
+
+    for (int i = 0; i < kH; ++i) {
+      for (int j = 0; j < kW; ++j) {
+        const int h = hInStart + i * dilationH;
+        const int w = wInStart + j * dilationW;
+        result +=
+          (h >= 0 && w >= 0 && h < inputHeight && w < inputWidth) ?
+          input[i * dilationH * inputWidth + j * dilationW] * (*weight) :
+          ScalarConvert<int, real>::to(0);
+        ++weight;
+      }
+    }
+
+    output[outIndex] += ScalarConvert<accreal, real>::to(result);
+  }
+}
+
 void THNN_(SpatialDepthWiseConvolution_updateOutput)(
            THCState *state,
            THCTensor *input,
@@ -82,23 +190,10 @@ void THNN_(SpatialDepthWiseConvolution_updateOutput)(
 
   THNN_(SpatialDepthWiseConvolution_shapeCheck)
        (state, input, NULL, weight, bias, kH, kW, dH, dW, padH, padW);
-
-  // Transpose weight & bias
-  THCTensor *_weight = THCTensor_(newTranspose)(state, weight, 0, 1);
-  weight = THCTensor_(newContiguous)(state, _weight);
-
-  THCTensor *_bias = NULL;
-  if(bias) {
-    _bias = THCTensor_(newTranspose)(state, bias, 0, 1);
-    bias = THCTensor_(newContiguous)(state, _bias);
-  }
-
-  // resize weight
-  long s1 = weight->size[0];
-  long s2 = weight->size[1];
-  long s3 = weight->size[2] * weight->size[3];
-  weight = THCTensor_(newWithStorage3d)(state, weight->storage, weight->storageOffset,
-          s1, -1, s2, -1, s3, -1);
+  long inputWidth   = input->size[3];
+  long inputHeight  = input->size[2];
+  long outputWidth  = (inputWidth + 2*padW - kW) / dW + 1;
+  long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
 
   input = THCTensor_(newContiguous)(state, input);
 
@@ -109,138 +204,55 @@ void THNN_(SpatialDepthWiseConvolution_updateOutput)(
     THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]);
   }
 
-  long inputWidth   = input->size[3];
-  long inputHeight  = input->size[2];
-  long outputWidth  = (inputWidth + 2*padW - kW) / dW + 1;
-  long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
-
   // Batch size + input planes
   long batchSize = input->size[0];
 
   // Resize output
   THCTensor_(resize5d)(state, output, batchSize, nInputPlane, nOutputPlane, outputHeight, outputWidth);
 
-  // Resize temporary columns
-  THCTensor_(resize2d)(state, columns, kW*kH, outputHeight*outputWidth);
+  THCTensor_(resizeAs)(state, columns, weight); // reserve size for transpose
 
-  // Define a buffer of ones, for bias accumulation
-  // Note: this buffer can be shared with other modules, it only ever gets increased,
-  // and always contains ones.
-  if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
-    // Resize plane and fill with ones...
-    THCTensor_(resize2d)(state, ones, outputHeight, outputWidth);
-    THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1));
-  }
+  transposeWithBuffer(state, weight, columns->storage, 0, 1);
 
   // Helpers
-  THCTensor *input_n = THCTensor_(new)(state);
-  THCTensor *output_n = THCTensor_(new)(state);
-
-
-  // Helpers for DepthWiseConvolution
-  THCTensor *input_i = THCTensor_(new)(state);
-  THCTensor *output_i = THCTensor_(new)(state);
+  THCTensor *outputTransposed_i = THCTensor_(new)(state);
   THCTensor *weight_i = THCTensor_(new)(state);
 
-  THCTensor *bias_i = NULL;
-  if(bias) {
-    bias_i = THCTensor_(new)(state);
-  }
-
-  // For each elt in batch, do:
-  for (int elt = 0; elt < batchSize; elt ++) {
-    // Matrix mulitply per output:
-    THCTensor_(select)(state, input_n, input, 0, elt);
-    THCTensor_(select)(state, output_n, output, 0, elt);
-
-    for (int ipelt = 0; ipelt < nInputPlane; ipelt++)
-    {
-      // Fetch ipelt-th input plane
-      THCTensor_(narrow)(state, input_i, input_n, 0, ipelt, 1);
-      THCTensor_(select)(state, output_i, output_n, 0, ipelt);
-      THCTensor_(select)(state, weight_i, weight, 0, ipelt);
-      if (bias) {
-        THCTensor_(select)(state, bias_i, bias, 0, ipelt);
-      }
-
-      // Do Bias first:
-      // M,N,K are dims of matrix A and B
-      // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-      long m_ = nOutputPlane;
-      long n_ = outputHeight * outputWidth;
-      long k_ = 1;
-
-      // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
-      if (bias) {
-        #ifdef THC_REAL_IS_FLOAT
-        THCudaBlas_Sgemm(
-        #elif defined(THC_REAL_IS_HALF)
-        THCudaBlas_Hgemm(
-        #elif defined(THC_REAL_IS_DOUBLE)
-        THCudaBlas_Dgemm(
-        #endif
-            state,
-            't', 'n',
-            n_, m_, k_,
-            ScalarConvert<int, real>::to(1),
-            THCTensor_(data)(state, ones), k_,
-            THCTensor_(data)(state, bias_i), k_,
-            ScalarConvert<int, real>::to(0),
-            THCTensor_(data)(state, output_i), n_
-        );
-      } else {
-        THCTensor_(zero)(state, output_i);
-      }
-
-      // Extract columns:
-      im2col(
-        THCState_getCurrentStream(state),
-        THCTensor_(data)(state, input_i),
-        1, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
-        1, 1, THCTensor_(data)(state, columns)
-      );
-
-      // M,N,K are dims of matrix A and B
-      // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-      long m = nOutputPlane;
-      long n = columns->size[1];
-      long k = 1*kH*kW;
+  // Make sure `ones` buffer is at least as large as `output`
+  THCTensor_(resize4d)(state, ones,
+      nInputPlane, nOutputPlane, batchSize, outputHeight*outputWidth);
 
-      // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
-      #ifdef THC_REAL_IS_FLOAT
-      THCudaBlas_Sgemm(
-      #elif defined(THC_REAL_IS_HALF)
-      THCudaBlas_Hgemm(
-      #elif defined(THC_REAL_IS_DOUBLE)
-      THCudaBlas_Dgemm(
-      #endif
-          state,
-          'n', 'n',
-          n, m, k,
-          ScalarConvert<int, real>::to(1),
-          THCTensor_(data)(state, columns), n,
-          THCTensor_(data)(state, weight_i), k,
-          ScalarConvert<int, real>::to(1),
-          THCTensor_(data)(state, output_i), n
-      );
-    }
+  // Do bias first (fill the output)
+  if (bias) {
+    fillOutputWithBias
+      <<<GET_BLOCKS(batchSize * nInputPlane * nOutputPlane * outputHeight * outputWidth),
+        CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>> (
+       THCTensor_(data)(state, output), batchSize, outputHeight*outputWidth,
+       THCTensor_(data)(state, bias), nInputPlane, nOutputPlane);
+    THCudaCheck(cudaGetLastError());
+  } else {
+    THCTensor_(zero)(state, output);
   }
 
-  // Free
-  THCTensor_(free)(state, input_n);
-  THCTensor_(free)(state, output_n);
+  updateOutputKernel
+    <<<GET_BLOCKS(batchSize * nInputPlane * nOutputPlane * outputHeight * outputWidth),
+      CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
+     THCTensor_(data)(state, output), THCTensor_(data)(state, input),
+     THCTensor_(data)(state, weight), batchSize,
+     inputHeight, inputWidth,
+     nInputPlane, nOutputPlane,
+     outputHeight, outputWidth, kH, kW, padH, padW, dH, dW, 1, 1);
+  THCudaCheck(cudaGetLastError());
 
-  THCTensor_(free)(state, input_i);
-  THCTensor_(free)(state, output_i);
+  // transpose back
+  transposeWithBuffer(state, weight, columns->storage, 0, 1);
 
+  // Free
+  THCTensor_(free)(state, outputTransposed_i);
   THCTensor_(free)(state, weight_i);
-  THCTensor_(free)(state, weight);
-  THCTensor_(free)(state, _weight);
-
-  THCTensor_(free)(state, bias_i);
-  THCTensor_(free)(state, bias);
-  THCTensor_(free)(state, _bias);
 
-  // Transpose output
+  // Merge first dims of the output
   THCTensor_(resize4d)(state, output, batchSize, nInputPlane * nOutputPlane, outputHeight, outputWidth);
 
   // Make a contiguous copy of output (OPTIONAL)
@@ -260,6 +272,75 @@ void THNN_(SpatialDepthWiseConvolution_updateOutput)(
   THCTensor_(free)(state, input);
 }
 
+__global__ void updateGradInputKernel(
+    real *gradInput       , const real *gradOutput,
+    const real *weight    , const int batchSize   ,
+    const int inputHeight , const int inputWidth  ,
+    const int nInputPlane , const int nOutputPlane,
+    const int outputHeight, const int outputWidth ,
+    const int kH          , const int kW          ,
+    const int padH        , const int padW        ,
+    const int strideH     , const int strideW     ,
+    const int dilationH   , const int dilationW) {
+
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (index < batchSize * nInputPlane * inputHeight * inputWidth) {
+
+    const int gradInputIdx = index;
+    // grid is (batchSize) x (nInputPlane) x (inputHeight) x (inputWidth)
+    const int wIn        = index % inputWidth  + padW; index /= inputWidth ;
+    const int hIn        = index % inputHeight + padH; index /= inputHeight;
+    const int inPlaneIdx = index % nInputPlane       ; index /= nInputPlane;
+    // `index` is now the index of the sample in batch
+    const int & sampleIdx = index;
+
+    // gradOutput is (batchSize) x (nInputPlane) x (nOutputPlane) x (outputHeight) x (outputWidth)
+    gradOutput += (((sampleIdx
+        * nInputPlane + inPlaneIdx)
+        * nOutputPlane)
+        * outputHeight)
+        * outputWidth;
+
+    // weight is (nInputPlane) x (nOutputPlane) x (kH) x (kW)
+    weight += inPlaneIdx * nOutputPlane * kH * kW;
+
+    accreal result = 0;
+
+    const int kernelExtentW = (kW - 1) * dilationW + 1;
+    const int kernelExtentH = (kH - 1) * dilationH + 1;
+    // compute the start and end of the output
+    const int wOutStart =
+      (wIn < kernelExtentW) ? 0 : (wIn - kernelExtentW) / strideW + 1;
+    const int wOutEnd = min(wIn / strideW + 1, outputWidth);
+    const int hOutStart =
+      (hIn < kernelExtentH) ?
+      0 : (hIn - kernelExtentH) / strideH + 1;
+    const int hOutEnd = min(hIn / strideH + 1, outputHeight);
+
+    for (int outPlaneIdx = 0; outPlaneIdx < nOutputPlane; ++outPlaneIdx) {
+      for (int hOut = hOutStart; hOut < hOutEnd; ++hOut) {
+        for (int wOut = wOutStart; wOut < wOutEnd; ++wOut) {
+          int hWeight = (hIn - hOut * strideH);
+          int wWeight = (wIn - wOut * strideW);
+
+          // TODO: use LCM of stride and dilation to avoid unnecessary loops
+          if (hWeight % dilationH == 0 && wWeight % dilationW == 0) {
+            hWeight /= dilationH;
+            wWeight /= dilationW;
+
+            result +=
+              gradOutput[hOut * outputWidth + wOut] *
+              weight[hWeight * kW + wWeight];
+          }
+        }
+      }
+      gradOutput += outputHeight * outputWidth;
+    }
+
+    gradInput[gradInputIdx] = ScalarConvert<accreal, real>::to(result);
+  }
+}
+
 void THNN_(SpatialDepthWiseConvolution_updateGradInput)(
            THCState *state,
            THCTensor *input,
@@ -299,22 +380,8 @@ void THNN_(SpatialDepthWiseConvolution_updateGradInput)(
   THNN_(SpatialDepthWiseConvolution_shapeCheck)
        (state, input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW);
 
-  // Transpose weight
-  THCTensor *_weight = THCTensor_(newTranspose)(state, weight, 0, 1);
-  weight = THCTensor_(newContiguous)(state, _weight);
-
-  // resize weight
-  long s1 = weight->size[0];
-  long s2 = weight->size[1];
-  long s3 = weight->size[2] * weight->size[3];
-  weight = THCTensor_(newWithStorage3d)(state, weight->storage, weight->storageOffset,
-          s1, -1, s2, -1, s3, -1);
-
-
   input = THCTensor_(newContiguous)(state, input);
-
   int batch = 1;
   if (input->nDimension == 3) {
     // Force batch
@@ -334,73 +401,14 @@ void THNN_(SpatialDepthWiseConvolution_updateGradInput)(
   // Resize output
   THCTensor_(resize4d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
 
-  // Resize temporary columns
-  THCTensor_(resize2d)(state, gradColumns, 1*kW*kH, outputHeight*outputWidth);
-
-  // Helpers
-  THCTensor *gradInput_n = THCTensor_(new)(state);
-  THCTensor *gradOutput_n = THCTensor_(new)(state);
-
-  // Helpers for DepthWiseConvolution
-  THCTensor *gradOutput_i = THCTensor_(new)(state);
-  THCTensor *gradInput_i = THCTensor_(new)(state);
-  THCTensor *weight_i = THCTensor_(new)(state);
-
-  // For each elt in batch, do:
-  for (int elt = 0; elt < batchSize; elt ++) {
-    // Matrix mulitply per sample:
-    THCTensor_(select)(state, gradInput_n, gradInput, 0, elt);
-    THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt);
-
-    for (int ipelt = 0; ipelt < nInputPlane; ipelt++)
-    {
-      // M,N,K are dims of matrix A and B
-      // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-
-      // Fetch ipelt-th input plane
-      THCTensor_(narrow)(state, gradInput_i, gradInput_n, 0, ipelt, 1);
-      THCTensor_(select)(state, gradOutput_i, gradOutput_n, 0, ipelt);
-      THCTensor_(select)(state, weight_i, weight, 0, ipelt);
-
-      long m = 1*kW*kH;
-      long n = gradColumns->size[1];
-      long k = nOutputPlane;
-
-      // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
-      #ifdef THC_REAL_IS_FLOAT
-      THCudaBlas_Sgemm(
-      #elif defined(THC_REAL_IS_HALF)
-      THCudaBlas_Hgemm(
-      #elif defined(THC_REAL_IS_DOUBLE)
-      THCudaBlas_Dgemm(
-      #endif
-          state,
-          'n', 't',
-          n, m, k,
-          ScalarConvert<int, real>::to(1),
-          THCTensor_(data)(state, gradOutput_i), n,
-          THCTensor_(data)(state, weight_i), m,
-          ScalarConvert<int, real>::to(0),
-          THCTensor_(data)(state, gradColumns), n
-      );
-
-      // Unpack columns back into input:
-      col2im<real, accreal>(
-        THCState_getCurrentStream(state),
-        THCTensor_(data)(state, gradColumns),
-        1, inputHeight, inputWidth, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
-        1, 1,
-        THCTensor_(data)(state, gradInput_i)
-      );
-    }
-  }
-
-  // Free
-  THCTensor_(free)(state, gradInput_n);
-  THCTensor_(free)(state, gradOutput_n);
-
-  THCTensor_(free)(state, gradInput_i);
-  THCTensor_(free)(state, gradOutput_i);
-  THCTensor_(free)(state, weight_i);
+  updateGradInputKernel
+    <<<GET_BLOCKS(batchSize * nInputPlane * inputHeight * inputWidth),
+      CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
+     THCTensor_(data)(state, gradInput), THCTensor_(data)(state, gradOutput),
+     THCTensor_(data)(state, weight), batchSize,
+     inputHeight, inputWidth, nInputPlane, nOutputPlane,
+     outputHeight, outputWidth, kH, kW, padH, padW, dH, dW, 1, 1);
+  THCudaCheck(cudaGetLastError());
 
   // Resize output
   if (batch == 0) {
@@ -411,8 +419,6 @@ void THNN_(SpatialDepthWiseConvolution_updateGradInput)(
 
   THCTensor_(free)(state, input);
   THCTensor_(free)(state, gradOutput);
-  THCTensor_(free)(state, weight);
-  THCTensor_(free)(state, _weight);
 }
 
 void THNN_(SpatialDepthWiseConvolution_accGradParameters)(
@@ -432,9 +438,27 @@ void THNN_(SpatialDepthWiseConvolution_accGradParameters)(
   THCUNN_assertSameGPU(state, 5, input, gradOutput, gradWeight, columns, ones);
   if (gradBias) {
-   THCUNN_assertSameGPU(state, 2, gradWeight, gradBias);
+    THCUNN_assertSameGPU(state, 2, gradWeight, gradBias);
+  }
+
+  input = THCTensor_(newContiguous)(state, input);
+
+  int batch = 1;
+  if (input->nDimension == 3) {
+    // Force batch
+    batch = 0;
+    THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]);
+    THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
   }
 
+  // Batch size + input planes
+  long batchSize = input->size[0];
+
+  long inputWidth   = input->size[3];
+  long inputHeight  = input->size[2];
+  long outputWidth  = (inputWidth + 2*padW - kW) / dW + 1;
+  long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
+
   // Params
   int nInputPlane = gradWeight->nDimension == 2 ?
      gradWeight->size[1]/(kW*kH) : gradWeight->size[1];
   int nOutputPlane = gradWeight->size[0];
@@ -442,7 +466,7 @@ void THNN_(SpatialDepthWiseConvolution_accGradParameters)(
     THCTensor_(resize4d)(state, gradWeight, nOutputPlane, nInputPlane, kH, kW);
   }
 
-  gradOutput = THCTensor_(newWithTensor)(state, gradOutput);
+  gradOutput = THCTensor_(newWithTensor)(state, gradOutput);
   if (input->nDimension == 3) {
     if (gradOutput->nDimension == 3) {
       THCTensor_(resize4d)(state, gradOutput, nInputPlane, nOutputPlane, gradOutput->size[1], gradOutput->size[2]);
     }
@@ -455,53 +479,12 @@ void THNN_(SpatialDepthWiseConvolution_accGradParameters)(
     }
   }
 
-
   THNN_(SpatialDepthWiseConvolution_shapeCheck)
        (state, input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW);
 
-  // Transpose gradWeight & gradBias
-  THCTensor_(transpose)(state, gradWeight, NULL, 0, 1);
-
-  THCTensor *_gradBias = NULL;
-  if(gradBias) {
-    THCTensor_(transpose)(state, gradBias, NULL, 0, 1);
-    _gradBias = gradBias;
-    gradBias = THCTensor_(newContiguous)(state, gradBias);
-  }
-
-  THCTensor *_gradWeight;
-  _gradWeight = gradWeight;
-  gradWeight = THCTensor_(newContiguous)(state, gradWeight);
-
-  // resize gradWeight
-  long s1 = gradWeight->size[0];
-  long s2 = gradWeight->size[1];
-  long s3 = gradWeight->size[2] * gradWeight->size[3];
-  gradWeight = THCTensor_(newWithStorage3d)(state, gradWeight->storage, gradWeight->storageOffset,
-          s1, -1, s2, -1, s3, -1);
-
-  input = THCTensor_(newContiguous)(state, input);
-
-  int batch = 1;
-  if (input->nDimension == 3) {
-    // Force batch
-    batch = 0;
-    THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]);
-    THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
-  }
-
-  long inputWidth   = input->size[3];
-  long inputHeight  = input->size[2];
-  long outputWidth  = (inputWidth + 2*padW - kW) / dW + 1;
-  long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
-
-  // Batch size + input planes
-  long batchSize = input->size[0];
+  // Do `gradBias` first:
+  if (gradBias) {
+    transposeWithBuffer(state, gradBias, columns->storage, 0, 1);
+  }
 
   // Define a buffer of ones, for bias accumulation
   if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
     // Resize plane and fill with ones...
     THCTensor_(resize2d)(state, ones, outputHeight, outputWidth);
     THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1));
   }
 
@@ -510,134 +493,138 @@ void THNN_(SpatialDepthWiseConvolution_accGradParameters)(
-  // Resize temporary columns
-  THCTensor_(resize2d)(state, columns, 1*kW*kH, outputHeight*outputWidth);
-
   // Helpers
   THCTensor *input_n = THCTensor_(new)(state);
   THCTensor *gradOutput_n = THCTensor_(new)(state);
 
-  // Helpers for DepthWiseConvolution
-  THCTensor *gradOutput_i = THCTensor_(new)(state);
-  THCTensor *input_i = THCTensor_(new)(state);
-  THCTensor *gradWeight_i = THCTensor_(new)(state);
-
-  THCTensor *gradBias_i = NULL;
-  if(gradBias) {
-    gradBias_i = THCTensor_(new)(state);
-  }
-
   // For each elt in batch, do:
   for (int elt = 0; elt < batchSize; elt ++) {
     // Matrix mulitply per output:
     THCTensor_(select)(state, input_n, input, 0, elt);
     THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt);
 
-    for (int ipelt = 0; ipelt < nInputPlane; ipelt++)
-    {
-      THCTensor_(narrow)(state, input_i, input_n, 0, ipelt, 1);
-      THCTensor_(select)(state, gradOutput_i, gradOutput_n, 0, ipelt);
-      THCTensor_(select)(state, gradWeight_i, gradWeight, 0, ipelt);
-      if (gradBias) {
-        THCTensor_(select)(state, gradBias_i, gradBias, 0, ipelt);
-      }
-
-      // Extract columns:
-      im2col(
-        THCState_getCurrentStream(state),
-        THCTensor_(data)(state, input_i),
-        1, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
-        1, 1, THCTensor_(data)(state, columns)
-      );
-
-      // M,N,K are dims of matrix A and B
-      // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-      long m = nOutputPlane;
-      long n = 1*kW*kH;
-      long k = columns->size[1];
+    // Do Bias:
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m_ = nInputPlane * nOutputPlane;
+    long k_ = outputHeight * outputWidth;
 
-      // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
+    if (gradBias) {
+      #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
       #ifdef THC_REAL_IS_FLOAT
-      THCudaBlas_Sgemm(
-      #elif defined(THC_REAL_IS_HALF)
-      THCudaBlas_Hgemm(
+      THCudaBlas_Sgemv(
       #elif defined(THC_REAL_IS_DOUBLE)
-      THCudaBlas_Dgemm(
+      THCudaBlas_Dgemv(
+      #endif
+          state,
+          't',
+          k_, m_,
+          scale,
+          THCTensor_(data)(state, gradOutput_n), k_,
+          THCTensor_(data)(state, ones), 1,
+          ScalarConvert<int, real>::to(1),
+          THCTensor_(data)(state, gradBias), 1
+      );
       #endif
+      #ifdef THC_REAL_IS_HALF
+      THCudaBlas_Hgemm(
           state,
           't', 'n',
-          n, m, k,
+          m_, 1, k_,
           scale,
-          THCTensor_(data)(state, columns), k,
-          THCTensor_(data)(state, gradOutput_i), k,
+          THCTensor_(data)(state, gradOutput_n), k_,
+          THCTensor_(data)(state, ones), k_,
           ScalarConvert<int, real>::to(1),
-          THCTensor_(data)(state, gradWeight_i), n
+          THCTensor_(data)(state, gradBias), m_
       );
-
-      // Do Bias:
-      // M,N,K are dims of matrix A and B
-      // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-      long m_ = nOutputPlane;
-      long k_ = outputHeight * outputWidth;
-
-      // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
-      if (gradBias) {
-        #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
-        #ifdef THC_REAL_IS_FLOAT
-        THCudaBlas_Sgemv(
-        #elif defined(THC_REAL_IS_DOUBLE)
-        THCudaBlas_Dgemv(
-        #endif
-            state,
-            't',
-            k_, m_,
-            scale,
-            THCTensor_(data)(state, gradOutput_i), k_,
-            THCTensor_(data)(state, ones), 1,
-            ScalarConvert<int, real>::to(1),
-            THCTensor_(data)(state, gradBias_i), 1
-        );
-        #endif
-        #ifdef THC_REAL_IS_HALF
-        THCudaBlas_Hgemm(
-            state,
-            't', 'n',
-            m_, 1, k_,
-            scale,
-            THCTensor_(data)(state, gradOutput_i), k_,
-            THCTensor_(data)(state, ones), k_,
-            ScalarConvert<int, real>::to(1),
-            THCTensor_(data)(state, gradBias_i), m_
-        );
-        #endif
-      }
-    }
+      #endif
+    }
   }
 
+  if (gradBias) {
+    transposeWithBuffer(state, gradBias, columns->storage, 0, 1);
+  }
+
+  // OK, bias done, now let's do `gradWeight`:
 
-  // Copy back and transpose back
-  THCTensor_(transpose)(state, _gradWeight, NULL, 0, 1);
-  THCTensor_(resize4d)(state, _gradWeight, nInputPlane, nOutputPlane, kH, kW);
-  THCTensor_(copy)(state, _gradWeight, gradWeight);
-  THCTensor_(transpose)(state, _gradWeight, NULL, 0, 1);
+  // merge two last (spatial) dimensions
+  THCTensor_(setStorage3d)(state, gradWeight,
+      gradWeight->storage, gradWeight->storageOffset,
+      nOutputPlane, -1, nInputPlane, -1, kH*kW, -1);
 
-  if(gradBias) {
-    THCTensor_(transpose)(state, _gradBias, NULL, 0, 1);
-    THCTensor_(resize2d)(state, _gradBias, nInputPlane, nOutputPlane);
-    THCTensor_(copy)(state, _gradBias, gradBias);
-    THCTensor_(transpose)(state, _gradBias, NULL, 0, 1);
-  }
+  transposeWithBuffer(state, gradWeight, columns->storage, 0, 1);
+  // transpose for proper accumulation in GEMM
+  transposeWithBuffer(state, gradWeight, columns->storage, 1, 2);
+
+  // Resize temporary columns
+  THCTensor_(resize3d)(state, columns, kW*kH, batchSize, outputHeight*outputWidth);
+
+  // NOTE: `ones` is reused below as scratch for the grouped gradOutput,
+  // so past this point it no longer holds all-ones values
+  THCTensor_(resize3d)(state, ones, nOutputPlane, batchSize, outputHeight*outputWidth);
+  THCTensor *gradOutputGrouped = ones;
+
+  THCTensor *gradWeight_i = THCTensor_(new)(state);
+  THCTensor *gradOutput_i = THCTensor_(new)(state);
+
+  for (int inPlaneIdx = 0; inPlaneIdx < nInputPlane; ++inPlaneIdx) {
+
+    // group gradOutput planes by input plane index and transpose
+    // `gradOutputGrouped` has size (nOutputPlane) x (batchSize) x (outputHeight) x (outputWidth)
+    THCTensor_(select)(state, gradOutput_i, gradOutput, 1, inPlaneIdx);
+    THCTensor_(transpose)(state, gradOutput_i, gradOutput_i, 0, 1);
+    THCTensor_(copy)(state, gradOutputGrouped, gradOutput_i);
+
+    // columns:           (kW*kH) x (batchSize) x (outputHeight*outputWidth)
+    // gradOutputGrouped: (nOutputPlane) x (batchSize) x (outputHeight*outputWidth), for the current input plane
+    // gradWeight:        (nInputPlane) x (nOutputPlane) x (kH*kW)
+    THCTensor_(select)(state, gradWeight_i, gradWeight, 0, inPlaneIdx);
+
+    // Extract columns:
+    im2col_depthwise(
+      THCState_getCurrentStream(state),
+      THCTensor_(data)(state, input), batchSize, nInputPlane,
+      inPlaneIdx, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+      1, 1, THCTensor_(data)(state, columns)
+    );
+
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m = nOutputPlane;
+    long n = kH * kW;
+    long k = batchSize * outputHeight * outputWidth;
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    // (m x k) * (n x k)^T = (m x n)
+    #ifdef THC_REAL_IS_FLOAT
+    THCudaBlas_Sgemm(
+    #elif defined(THC_REAL_IS_HALF)
+    THCudaBlas_Hgemm(
+    #elif defined(THC_REAL_IS_DOUBLE)
+    THCudaBlas_Dgemm(
+    #endif
+        state,
+        't', 'n',
+        m, n, k,
+        scale,
+        THCTensor_(data)(state, gradOutputGrouped), k,
+        THCTensor_(data)(state, columns), k,
+        ScalarConvert<int, real>::to(1),
+        THCTensor_(data)(state, gradWeight_i), m
+    );
+  }
+
+  // transpose back
+  transposeWithBuffer(state, gradWeight, columns->storage, 1, 2);
+  transposeWithBuffer(state, gradWeight, columns->storage, 0, 1);
+
+  // un-merge two last (spatial) dimensions back
+  THCTensor_(setStorage4d)(state, gradWeight,
+      gradWeight->storage, gradWeight->storageOffset,
+      nOutputPlane, -1, nInputPlane, -1, kH, -1, kW, -1);
 
   // Free
   THCTensor_(free)(state, input_n);
   THCTensor_(free)(state, gradOutput_n);
 
-  THCTensor_(free)(state, input_i);
-  THCTensor_(free)(state, gradOutput_i);
+  THCTensor_(free)(state, gradWeight_i);
-  THCTensor_(free)(state, gradWeight);
-  THCTensor_(free)(state, gradBias_i);
-  THCTensor_(free)(state, gradBias);
+  THCTensor_(free)(state, gradOutput_i);
 
   // Resize
   if (batch == 0) {
diff --git a/lib/THCUNN/im2col.h b/lib/THCUNN/im2col.h
index 060525f5..b5af9eef 100644
--- a/lib/THCUNN/im2col.h
+++ b/lib/THCUNN/im2col.h
@@ -7,14 +7,15 @@
 // Kernel for fast unfold+copy
 // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu)
 template <typename Dtype>
-__global__ void im2col_kernel(const int n, const Dtype* data_im,
-                              const int height, const int width,
-                              const int ksize_h, const int ksize_w,
-                              const int pad_h, const int pad_w,
-                              const int stride_h, const int stride_w,
-                              const int dilation_h, const int dilation_w,
-                              const int height_col, const int width_col,
-                              Dtype* data_col) {
+__global__ void im2col_kernel(
+    const int n, const Dtype* data_im,
+    const int height, const int width,
+    const int ksize_h, const int ksize_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    const int dilation_h, const int dilation_w,
+    const int height_col, const int width_col,
+    Dtype* data_col) {
   CUDA_KERNEL_LOOP(index, n) {
     int w_out = index % width_col;
     index /= width_col;
@@ -38,11 +39,12 @@ __global__ void im2col_kernel(const int n, const Dtype* data_im,
 }
 
 template <typename Dtype>
-void im2col(cudaStream_t stream, const Dtype* data_im, const int channels,
-            const int height, const int width,
-            const int ksize_h, const int ksize_w, const int pad_h,
-            const int pad_w, const int stride_h, const int stride_w,
-            const int dilation_h, const int dilation_w, Dtype* data_col) {
+void im2col(
+    cudaStream_t stream, const Dtype* data_im, const int channels,
+    const int height, const int width,
+    const int ksize_h, const int ksize_w, const int pad_h,
+    const int pad_w, const int stride_h, const int stride_w,
+    const int dilation_h, const int dilation_w, Dtype* data_col) {
   // We are going to launch channels * height_col * width_col kernels, each
   // kernel responsible for copying a single-channel grid.
   int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1))
@@ -60,15 +62,87 @@ void im2col(cudaStream_t stream, const Dtype* data_im, const int channels,
   THCudaCheck(cudaGetLastError());
 }
 
+// im2col version for SpatialDepthWiseConvolution.
+// Similar to im2col, but the output matrix is `batch_size` blocks of
+// size (kW*kH) x (h_out*w_out) concatenated over the SECOND dimension,
+// where the i-th block is the result of `im2col` of the `inPlaneIdx`-th
+// plane of the i-th sample in the batch.
+// `data_col` must be able to accommodate (kW*kH) x (batch_size*h_out*w_out) elements.
+// (a CPU reference sketch of this layout is appended at the end of this document)
+template <typename Dtype>
+__global__ void im2col_depthwise_kernel(
+    const int n, const Dtype* data_im,
+    const int batch_size,
+    const int nInputPlane, const int inPlaneIdx,
+    const int height, const int width,
+    const int ksize_h, const int ksize_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    const int dilation_h, const int dilation_w,
+    const int height_col, const int width_col,
+    Dtype* data_col) {
+  CUDA_KERNEL_LOOP(index, n) {
+    int w_out = index % width_col;
+    index /= width_col;
+    int h_out = index % height_col;
+    index /= height_col; // `index` is now the index of this sample in the input batch
+    int channel_in = index * nInputPlane + inPlaneIdx;
+    int h_in = h_out * stride_h - pad_h;
+    int w_in = w_out * stride_w - pad_w;
+
+    data_col += index * height_col * width_col + width_col * h_out + w_out;
+    data_im += (channel_in * height + h_in) * width + w_in;
+
+    for (int i = 0; i < ksize_h; ++i) {
+      for (int j = 0; j < ksize_w; ++j) {
+        int h = h_in + i * dilation_h;
+        int w = w_in + j * dilation_w;
+        *data_col = (h >= 0 && w >= 0 && h < height && w < width) ?
+          data_im[i * dilation_h * width + j * dilation_w] : ScalarConvert<int, Dtype>::to(0);
+        data_col += batch_size * height_col * width_col;
+      }
+    }
+  }
+}
+
+template <typename Dtype>
+void im2col_depthwise(
+    cudaStream_t stream, const Dtype* data_im,
+    const int batch_size, const int nInputPlane /* per one sample */,
+    const int inPlaneIdx,
+    const int height, const int width,
+    const int ksize_h, const int ksize_w, const int pad_h,
+    const int pad_w, const int stride_h, const int stride_w,
+    const int dilation_h, const int dilation_w, Dtype* data_col) {
+
+  // We are going to launch batch_size * height_col * width_col kernels,
+  // each kernel responsible for copying a single-channel grid.
+  int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1))
+                   / stride_h + 1;
+  int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1))
+                  / stride_w + 1;
+  int num_kernels = batch_size * height_col * width_col;
+  // Launch
+  im2col_depthwise_kernel <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>> (
+      num_kernels, data_im, batch_size,
+      nInputPlane, inPlaneIdx, height, width,
+      ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w,
+      dilation_h, dilation_w,
+      height_col, width_col, data_col
+  );
+  THCudaCheck(cudaGetLastError());
+}
+
 template <typename Dtype, typename Acctype>
-__global__ void col2im_kernel(const int n, const Dtype* data_col,
-                              const int height, const int width, const int channels,
-                              const int kernel_h, const int kernel_w,
-                              const int pad_h, const int pad_w,
-                              const int stride_h, const int stride_w,
-                              const int dilation_h, const int dilation_w,
-                              const int height_col, const int width_col,
-                              Dtype* data_im) {
+__global__ void col2im_kernel(
+    const int n, const Dtype* data_col,
+    const int height, const int width, const int channels,
+    const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    const int dilation_h, const int dilation_w,
+    const int height_col, const int width_col,
+    Dtype* data_im) {
+
   CUDA_KERNEL_LOOP(index, n) {
     Acctype val = Acctype(0);
     const int w_im = index % width + pad_w;

From 260017f8ce356d7f4911cc8bee9758d8e1f79098 Mon Sep 17 00:00:00 2001
From: Egor Burkov
Date: Sat, 5 Aug 2017 17:25:08 +0300
Subject: [PATCH 2/2] SpatialDepthWiseConvolution: specialize dilation values
 and fix an updateGradInput bug

---
 .../generic/SpatialDepthWiseConvolution.cu | 21 ++++++++++++-------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/lib/THCUNN/generic/SpatialDepthWiseConvolution.cu b/lib/THCUNN/generic/SpatialDepthWiseConvolution.cu
index d324b00c..0bdeafe7 100644
--- a/lib/THCUNN/generic/SpatialDepthWiseConvolution.cu
+++ b/lib/THCUNN/generic/SpatialDepthWiseConvolution.cu
@@ -111,6 +111,7 @@ static inline void THNN_(SpatialDepthWiseConvolution_shapeCheck)(
   }
 }
 
+template <int dilationH, int dilationW>
 __global__ void updateOutputKernel(
     real *output          , const real *input     ,
     const real *weight    , const int batchSize   ,
@@ -119,8 +120,7 @@ __global__ void updateOutputKernel(
     const int outputHeight, const int outputWidth ,
     const int kH          , const int kW          ,
     const int padH        , const int padW        ,
-    const int strideH     , const int strideW     ,
-    const int dilationH   , const int dilationW) {
+    const int strideH     , const int strideW) {
 
   int index = blockIdx.x * blockDim.x + threadIdx.x;
 
@@ -236,13 +236,13 @@ void THNN_(SpatialDepthWiseConvolution_updateOutput)(
     THCTensor_(zero)(state, output);
   }
 
-  updateOutputKernel
+  updateOutputKernel<1, 1>
     <<<GET_BLOCKS(batchSize * nInputPlane * nOutputPlane * outputHeight * outputWidth),
       CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
     THCTensor_(data)(state, output), THCTensor_(data)(state, input),
     THCTensor_(data)(state, weight), batchSize,
     inputHeight, inputWidth,
     nInputPlane, nOutputPlane,
-    outputHeight, outputWidth, kH, kW, padH, padW, dH, dW, 1, 1);
+    outputHeight, outputWidth, kH, kW, padH, padW, dH, dW);
   THCudaCheck(cudaGetLastError());
 
   // transpose back
@@ -272,6 +272,7 @@ void THNN_(SpatialDepthWiseConvolution_updateOutput)(
   THCTensor_(free)(state, input);
 }
 
+template <int dilationH, int dilationW>
 __global__ void updateGradInputKernel(
     real *gradInput       , const real *gradOutput,
     const real *weight    , const int batchSize   ,
@@ -280,8 +281,7 @@ __global__ void updateGradInputKernel(
     const int outputHeight, const int outputWidth ,
     const int kH          , const int kW          ,
    const int padH         , const int padW        ,
-    const int strideH     , const int strideW     ,
-    const int dilationH   , const int dilationW) {
+    const int strideH     , const int strideW) {
 
   int index = blockIdx.x * blockDim.x + threadIdx.x;
 
@@ -335,6 +335,7 @@ __global__ void updateGradInputKernel(
       }
     }
     gradOutput += outputHeight * outputWidth;
+    weight += kH * kW;
   }
 
   gradInput[gradInputIdx] = ScalarConvert<accreal, real>::to(result);
@@ -401,15 +402,19 @@ void THNN_(SpatialDepthWiseConvolution_updateGradInput)(
   // Resize output
   THCTensor_(resize4d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
 
-  updateGradInputKernel
+  transposeWithBuffer(state, weight, gradColumns->storage, 0, 1);
+
+  updateGradInputKernel<1, 1>
     <<<GET_BLOCKS(batchSize * nInputPlane * inputHeight * inputWidth),
      CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
    THCTensor_(data)(state, gradInput), THCTensor_(data)(state, gradOutput),
    THCTensor_(data)(state, weight), batchSize,
    inputHeight, inputWidth, nInputPlane, nOutputPlane,
-   outputHeight, outputWidth, kH, kW, padH, padW, dH, dW, 1, 1);
+   outputHeight, outputWidth, kH, kW, padH, padW, dH, dW);
   THCudaCheck(cudaGetLastError());
 
+  transposeWithBuffer(state, weight, gradColumns->storage, 0, 1);
+
   // Resize output
   if (batch == 0) {
     THCTensor_(select)(state, gradOutput, NULL, 0, 0);
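
Note on the `im2col_depthwise` layout (patch 1). The doc comment in im2col.h describes the column buffer as `batch_size` blocks of size (kW*kH) x (h_out*w_out) concatenated over the second dimension. The CPU sketch below pins that layout down; it is illustrative only and not part of the patch — the function name is hypothetical, and dilation is fixed to 1 to match the call sites in `accGradParameters`.

#include <vector>

// CPU reference (hypothetical helper, for illustration) of the layout that
// im2col_depthwise writes for one fixed input plane `inPlaneIdx`:
// the kernel element (i, j) sampled at output pixel (y, x) of sample `b`
// lands at col[(((i*kW + j) * batch + b) * hOut + y) * wOut + x],
// i.e. the buffer is (kH*kW) x (batch * hOut * wOut).
void im2col_depthwise_cpu(
    const std::vector<float>& im,  // (batch) x (nInputPlane) x (H) x (W), contiguous
    int batch, int nInputPlane, int inPlaneIdx,
    int H, int W, int kH, int kW,
    int padH, int padW, int strideH, int strideW,
    std::vector<float>& col) {
  // dilation is fixed to 1 here, matching the call sites in the patch
  const int hOut = (H + 2*padH - kH) / strideH + 1;
  const int wOut = (W + 2*padW - kW) / strideW + 1;
  col.assign((size_t)kH * kW * batch * hOut * wOut, 0.f);

  for (int b = 0; b < batch; ++b)
    for (int i = 0; i < kH; ++i)
      for (int j = 0; j < kW; ++j)
        for (int y = 0; y < hOut; ++y)
          for (int x = 0; x < wOut; ++x) {
            const int h = y * strideH - padH + i;  // input row being read
            const int w = x * strideW - padW + j;  // input column being read
            const float v = (h >= 0 && w >= 0 && h < H && w < W)
                ? im[((b * nInputPlane + inPlaneIdx) * H + h) * W + w]
                : 0.f;  // zero padding
            col[(((i*kW + j) * batch + b) * hOut + y) * wOut + x] = v;
          }
}

With this layout, the GEMM in `accGradParameters` contracts the whole (batchSize * outputHeight * outputWidth) dimension of `columns` against the grouped gradOutput in a single call per input plane, instead of one GEMM per (sample, plane) pair as in the replaced code.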
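Note on the dilation specialization (patch 2). The kernels are instantiated only at `<1, 1>`, which suffices because the SpatialDepthWiseConvolution API exposes no dilation arguments; turning the dilations into template parameters lets the compiler fold them as constants in the inner loops. If dilation were ever exposed, a small host-side dispatcher could map the runtime values onto instantiations, with a generic kernel as fallback. The sketch below is a hypothetical illustration of that pattern; `toyKernel` merely stands in for the real kernels.

#include <cuda_runtime.h>

// Specialized version: dilations are compile-time constants inside the body,
// so arithmetic involving them can be folded/strength-reduced by the compiler
template <int dilationH, int dilationW>
__global__ void toyKernel(float* out, const float* in, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = in[i] * dilationH * dilationW;
}

// Generic version reading dilation from kernel arguments, used as a fallback
__global__ void toyKernelGeneric(float* out, const float* in, int n,
                                 int dilationH, int dilationW) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = in[i] * dilationH * dilationW;
}

// Hypothetical dispatcher: maps runtime dilation values onto the
// compile-time instantiations for the common cases
void launchToyKernel(float* out, const float* in, int n,
                     int dilationH, int dilationW, cudaStream_t stream) {
  const int threads = 256;
  const int blocks = (n + threads - 1) / threads;
  if (dilationH == 1 && dilationW == 1)
    toyKernel<1, 1><<<blocks, threads, 0, stream>>>(out, in, n);
  else if (dilationH == 2 && dilationW == 2)
    toyKernel<2, 2><<<blocks, threads, 0, stream>>>(out, in, n);
  else
    toyKernelGeneric<<<blocks, threads, 0, stream>>>(out, in, n, dilationH, dilationW);
}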