diff --git a/README.md b/README.md
index 75f0e49..d53387f 100644
--- a/README.md
+++ b/README.md
@@ -182,6 +182,8 @@ cuttResult cuttDestroy(cuttHandle handle);
 // handle = Returned handle to cuTT plan
 // idata = Input data size product(dim)
 // odata = Output data size product(dim)
+// alpha = scaling-factor for input
+// beta = scaling-factor for output
 //
 // Returns
 // Success/unsuccess code
diff --git a/src/cutt.cpp b/src/cutt.cpp
index 1aace3a..c7ae530 100755
--- a/src/cutt.cpp
+++ b/src/cutt.cpp
@@ -184,7 +184,7 @@ cuttResult cuttPlan(cuttHandle* handle, int rank, int* dim, int* permutation, si
 }
 
 cuttResult cuttPlanMeasure(cuttHandle* handle, int rank, int* dim, int* permutation, size_t sizeofType,
-  cudaStream_t stream, void* idata, void* odata) {
+  cudaStream_t stream, void* idata, void* odata, void* alpha, void *beta) {
 
   // Check that input parameters are valid
   cuttResult inpCheck = cuttPlanCheckInput(rank, dim, permutation, sizeofType);
@@ -245,7 +245,7 @@ cuttResult cuttPlanMeasure(cuttHandle* handle, int rank, int* dim, int* permutat
     cudaCheck(cudaDeviceSynchronize());
     timer.start();
     // Execute plan
-    if (!cuttKernel(*it, idata, odata)) return CUTT_INTERNAL_ERROR;
+    if (!cuttKernel(*it, idata, odata, alpha, beta)) return CUTT_INTERNAL_ERROR;
     timer.stop();
     double curTime = timer.seconds();
     // it->print();
@@ -293,7 +293,7 @@ cuttResult cuttDestroy(cuttHandle handle) {
   return CUTT_SUCCESS;
 }
 
-cuttResult cuttExecute(cuttHandle handle, void* idata, void* odata) {
+cuttResult cuttExecute(cuttHandle handle, void* idata, void* odata, void* alpha, void* beta) {
   auto it = planStorage.find(handle);
   if (it == planStorage.end()) return CUTT_INVALID_PLAN;
 
@@ -305,6 +305,6 @@ cuttResult cuttExecute(cuttHandle handle, void* idata, void* odata) {
   cudaCheck(cudaGetDevice(&deviceID));
   if (deviceID != plan.deviceID) return CUTT_INVALID_DEVICE;
 
-  if (!cuttKernel(plan, idata, odata)) return CUTT_INTERNAL_ERROR;
+  if (!cuttKernel(plan, idata, odata, alpha, beta)) return CUTT_INTERNAL_ERROR;
   return CUTT_SUCCESS;
 }
diff --git a/src/cutt.h b/src/cutt.h
index d1d2827..8dd5e02 100644
--- a/src/cutt.h
+++ b/src/cutt.h
@@ -74,7 +74,7 @@ cuttResult cuttPlan(cuttHandle* handle, int rank, int* dim, int* permutation, si
 // Success/unsuccess code
 //
 cuttResult cuttPlanMeasure(cuttHandle* handle, int rank, int* dim, int* permutation, size_t sizeofType,
-  cudaStream_t stream, void* idata, void* odata);
+  cudaStream_t stream, void* idata, void* odata, void* alpha = NULL, void* beta = NULL);
 
 //
 // Destroy plan
@@ -88,16 +88,18 @@ cuttResult cuttPlanMeasure(cuttHandle* handle, int rank, int* dim, int* permutat
 cuttResult cuttDestroy(cuttHandle handle);
 
 //
-// Execute plan out-of-place
+// Execute plan out-of-place; performs a tensor transposition of the form \f[ \mathcal{B}_{\pi(i_0,i_1,...,i_{d-1})} \gets \alpha * \mathcal{A}_{i_0,i_1,...,i_{d-1}} + \beta * \mathcal{B}_{\pi(i_0,i_1,...,i_{d-1})}, \f]
 //
 // Parameters
 // handle = Returned handle to cuTT plan
 // idata = Input data size product(dim)
 // odata = Output data size product(dim)
+// alpha = scalar for input
+// beta = scalar for output
 //
 // Returns
 // Success/unsuccess code
 //
-cuttResult cuttExecute(cuttHandle handle, void* idata, void* odata);
+cuttResult cuttExecute(cuttHandle handle, void* idata, void* odata, void* alpha = NULL, void* beta = NULL);
 
 #endif // CUTT_H
diff --git a/src/cuttkernel.cu b/src/cuttkernel.cu
index 478c851..e3fc8b6 100755
--- a/src/cuttkernel.cu
+++ b/src/cuttkernel.cu
@@ -35,12 +35,14 @@ SOFTWARE.
 // dim3 numthread(TILEDIM, TILEROWS, 1);
 // dim3 numblock( ((plan.volMm-1)/TILEDIM+1)*((plan.volMk-1)/TILEDIM+1), 1, plan.volMbar);
 //
-template <typename T>
+template <typename T, bool betaIsZero>
 __global__ void transposeTiled(
   const int numMm, const int volMbar, const int sizeMbar,
   const int2 tiledVol, const int cuDimMk, const int cuDimMm,
   const TensorConvInOut* RESTRICT glMbar,
-  const T* RESTRICT dataIn, T* RESTRICT dataOut) {
+  const T* RESTRICT dataIn, T* RESTRICT dataOut,
+  const T alpha,
+  const T beta) {
 
   // Shared memory
   __shared__ T shTile[TILEDIM][TILEDIM+1];
@@ -108,7 +110,10 @@ __global__ void transposeTiled(
       // int pos = posOut + j*cuDimMm;
       // if (xout + j < readVol.x && yout < readVol.y) {
       if ((maskOutx & (1 << j)) != 0 ) {
-        dataOut[posOut] = shTile[threadIdx.x][threadIdx.y + j];
+        if( betaIsZero )
+          dataOut[posOut] = alpha * shTile[threadIdx.x][threadIdx.y + j];
+        else
+          dataOut[posOut] = alpha * shTile[threadIdx.x][threadIdx.y + j] + beta * dataOut[posOut];
       }
       posOut += posOutAdd;
     }
@@ -120,14 +125,16 @@
 //
 // Packed transpose. Thread block loads plan.volMmk number of elements
 //
-template <typename T, const int numRegStorage>
+template <typename T, const int numRegStorage, bool betaIsZero>
 __global__ void transposePacked(
   const int volMmk, const int volMbar,
   const int sizeMmk, const int sizeMbar,
   const TensorConvInOut* RESTRICT gl_Mmk,
   const TensorConvInOut* RESTRICT gl_Mbar,
   const TensorConv* RESTRICT gl_Msh,
-  const T* RESTRICT dataIn, T* RESTRICT dataOut) {
+  const T* RESTRICT dataIn, T* RESTRICT dataOut,
+  const T alpha,
+  const T beta) {
 
   // Shared memory. volMmk elements
   extern __shared__ char shBuffer_char[];
@@ -213,7 +220,11 @@ __global__ void transposePacked(
   for (int j=0;j < numRegStorage;j++) {
     int posMmk = threadIdx.x + j*blockDim.x;
     int posOut = posMbarOut + posMmkOut[j];
-    if (posMmk < volMmk) dataOut[posOut] = shBuffer[posSh[j]];
+    if (posMmk < volMmk)
+      if( betaIsZero )
+        dataOut[posOut] = alpha * shBuffer[posSh[j]];
+      else
+        dataOut[posOut] = alpha * shBuffer[posSh[j]] + beta * dataOut[posOut];
   }
 
 
@@ -227,7 +238,7 @@
 // dim nthread(((volMmkWithSplit - 1)/(prop.warpSize*lc.numRegStorage) + 1)*prop.warpSize, 1, 1)
 // dim nblock(ts.numSplit, min(256, max(1, ts.volMbar)), 1)
 //
-template <typename T, const int numRegStorage>
+template <typename T, const int numRegStorage, bool betaIsZero>
 __global__ void transposePackedSplit(
   const int splitDim, const int volMmkUnsplit, const int volMbar,
   const int sizeMmk, const int sizeMbar,
@@ -235,7 +246,9 @@ __global__ void transposePackedSplit(
   const TensorConvInOut* RESTRICT glMmk,
   const TensorConvInOut* RESTRICT glMbar,
   const TensorConv* RESTRICT glMsh,
-  const T* RESTRICT dataIn, T* RESTRICT dataOut) {
+  const T* RESTRICT dataIn, T* RESTRICT dataOut,
+  const T alpha,
+  const T beta) {
 
   // Shared memory. max(volSplit)*volMmkUnsplit T elements
   extern __shared__ char shBuffer_char[];
@@ -339,7 +352,11 @@ __global__ void transposePackedSplit(
   for (int j=0;j < numRegStorage;j++) {
     int posMmk = threadIdx.x + j*blockDim.x;
     int posOut = posMbarOut + posMmkOut[j];
-    if (posMmk < volMmkSplit) dataOut[posOut] = shBuffer[posSh[j]];
+    if (posMmk < volMmkSplit)
+      if( betaIsZero )
+        dataOut[posOut] = alpha * shBuffer[posSh[j]];
+      else
+        dataOut[posOut] = alpha * shBuffer[posSh[j]] + beta * dataOut[posOut];
   }
 
 }
@@ -353,13 +370,15 @@
 // dim3 numthread(TILEDIM, TILEROWS, 1);
 // dim3 numblock( ((plan.volMm-1)/TILEDIM+1)*((plan.volMkBar-1)/TILEDIM+1), 1, plan.volMbar);
 //
-template <typename T>
+template <typename T, bool betaIsZero>
 __global__ void transposeTiledCopy(
   const int numMm, const int volMbar, const int sizeMbar,
   const int cuDimMk, const int cuDimMm,
   const int2 tiledVol,
   const TensorConvInOut* RESTRICT gl_Mbar,
-  const T* RESTRICT dataIn, T* RESTRICT dataOut) {
+  const T* RESTRICT dataIn, T* RESTRICT dataOut,
+  const T alpha,
+  const T beta) {
 
   const int warpLane = threadIdx.x & (warpSize - 1);
   TensorConvInOut Mbar;
@@ -416,7 +435,10 @@ __global__ void transposeTiledCopy(
   for (int j=0;j < TILEDIM;j += TILEROWS) {
     // if ((x < tiledVol.x) && (y + j < tiledVol.y)) {
     if ((mask & (1 << j)) != 0) {
-      dataOut[posOut] = val[j/TILEROWS];
+      if(betaIsZero)
+        dataOut[posOut] = alpha * val[j/TILEROWS];
+      else
+        dataOut[posOut] = alpha * val[j/TILEROWS] + beta * dataOut[posOut];
     }
     posOut += posOutAdd;
   }
@@ -522,27 +544,27 @@ __global__ void transposeTiledCopy(
 // Sets shared memory bank configuration for all kernels. Needs to be called once per device.
 //
 void cuttKernelSetSharedMemConfig() {
-#define CALL(NREG) cudaCheck(cudaFuncSetSharedMemConfig(transposePacked<float, NREG>, cudaSharedMemBankSizeFourByte ))
+#define CALL(NREG) cudaCheck(cudaFuncSetSharedMemConfig(transposePacked<float, NREG, true>, cudaSharedMemBankSizeFourByte ))
 #include "calls.h"
 #undef CALL
 
-#define CALL(NREG) cudaCheck(cudaFuncSetSharedMemConfig(transposePacked<double, NREG>, cudaSharedMemBankSizeEightByte ))
+#define CALL(NREG) cudaCheck(cudaFuncSetSharedMemConfig(transposePacked<double, NREG, true>, cudaSharedMemBankSizeEightByte ))
 #include "calls.h"
 #undef CALL
 
-#define CALL(NREG) cudaCheck(cudaFuncSetSharedMemConfig(transposePackedSplit<float, NREG>, cudaSharedMemBankSizeFourByte ))
+#define CALL(NREG) cudaCheck(cudaFuncSetSharedMemConfig(transposePackedSplit<float, NREG, true>, cudaSharedMemBankSizeFourByte ))
 #include "calls.h"
 #undef CALL
 
-#define CALL(NREG) cudaCheck(cudaFuncSetSharedMemConfig(transposePackedSplit<double, NREG>, cudaSharedMemBankSizeEightByte ))
+#define CALL(NREG) cudaCheck(cudaFuncSetSharedMemConfig(transposePackedSplit<double, NREG, true>, cudaSharedMemBankSizeEightByte ))
 #include "calls.h"
 #undef CALL
 
-  cudaCheck(cudaFuncSetSharedMemConfig(transposeTiled<float>, cudaSharedMemBankSizeFourByte));
-  cudaCheck(cudaFuncSetSharedMemConfig(transposeTiledCopy<float>, cudaSharedMemBankSizeFourByte));
+  cudaCheck(cudaFuncSetSharedMemConfig(transposeTiled<float, true>, cudaSharedMemBankSizeFourByte));
+  cudaCheck(cudaFuncSetSharedMemConfig(transposeTiledCopy<float, true>, cudaSharedMemBankSizeFourByte));
 
-  cudaCheck(cudaFuncSetSharedMemConfig(transposeTiled<double>, cudaSharedMemBankSizeEightByte));
-  cudaCheck(cudaFuncSetSharedMemConfig(transposeTiledCopy<double>, cudaSharedMemBankSizeEightByte));
+  cudaCheck(cudaFuncSetSharedMemConfig(transposeTiled<double, true>, cudaSharedMemBankSizeEightByte));
+  cudaCheck(cudaFuncSetSharedMemConfig(transposeTiledCopy<double, true>, cudaSharedMemBankSizeEightByte));
 
 }
 
@@ -574,7 +596,7 @@ int getNumActiveBlock(const int method, const int sizeofType, const LaunchConfig
     {
#define CALL0(TYPE, NREG) \
     cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numActiveBlock, \
-      transposePacked<TYPE, NREG>, numthread, lc.shmemsize)
+      transposePacked<TYPE, NREG, true>, numthread, lc.shmemsize)
       switch(lc.numRegStorage) {
#define CALL(ICASE) case ICASE: if (sizeofType == 4) CALL0(float, ICASE); if (sizeofType == 8) CALL0(double, ICASE); break
 #include "calls.h"
@@ -610,7 +632,7 @@ int getNumActiveBlock(const int method, const int sizeofType, const LaunchConfig
         // key not found in cache, determine value and add it to cache
#define CALL0(TYPE, NREG) \
     cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numActiveBlock, \
-      transposePackedSplit<TYPE, NREG>, numthread, lc.shmemsize)
+      transposePackedSplit<TYPE, NREG, true>, numthread, lc.shmemsize)
         switch(lc.numRegStorage) {
#define CALL(ICASE) case ICASE: if (sizeofType == 4) CALL0(float, ICASE); if (sizeofType == 8) CALL0(double, ICASE); break
 #include "calls.h"
@@ -626,10 +648,10 @@ int getNumActiveBlock(const int method, const int sizeofType, const LaunchConfig
     {
       if (sizeofType == 4) {
         cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numActiveBlock,
-          transposeTiled<float>, numthread, lc.shmemsize);
+          transposeTiled<float, true>, numthread, lc.shmemsize);
       } else {
         cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numActiveBlock,
-          transposeTiled<double>, numthread, lc.shmemsize);
+          transposeTiled<double, true>, numthread, lc.shmemsize);
       }
     }
     break;
@@ -638,10 +660,10 @@ int getNumActiveBlock(const int method, const int sizeofType, const LaunchConfig
     {
       if (sizeofType == 4) {
         cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numActiveBlock,
-          transposeTiledCopy<float>, numthread, lc.shmemsize);
+          transposeTiledCopy<float, true>, numthread, lc.shmemsize);
       } else {
         cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numActiveBlock,
-          transposeTiledCopy<double>, numthread, lc.shmemsize);
+          transposeTiledCopy<double, true>, numthread, lc.shmemsize);
       }
     }
     break;
@@ -837,14 +859,30 @@ int cuttKernelLaunchConfiguration(const int sizeofType, const TensorSplit& ts,
   return numActiveBlockReturn;
 }
 
-bool cuttKernel(cuttPlan_t& plan, void* dataIn, void* dataOut) {
+bool cuttKernel(cuttPlan_t& plan, void* dataIn, void* dataOut, void* alphaPtr,
+    void* betaPtr) {
 
   LaunchConfig& lc = plan.launchConfig;
   TensorSplit& ts = plan.tensorSplit;
 
+  double alpha = 1;
+  double beta = 0;
+  if( alphaPtr && plan.sizeofType == 4 )
+    alpha = *((float*)alphaPtr);
+  if( alphaPtr && plan.sizeofType == 8 )
+    alpha = *((double*)alphaPtr);
+  if( betaPtr && plan.sizeofType == 4 )
+    beta = *((float*)betaPtr);
+  if( betaPtr && plan.sizeofType == 8 )
+    beta = *((double*)betaPtr);
+
   switch(ts.method) {
 
     case Trivial:
     {
+      if( alpha != 1 || beta != 0 ){
+        printf("cuTT ERROR: this case still has to be implemented\n");
+        return false;
+      }
       cudaCheck(cudaMemcpyAsync(dataOut, dataIn, ts.volMmk*ts.volMbar*plan.sizeofType,
         cudaMemcpyDeviceToDevice, plan.stream));
     }
@@ -853,11 +891,22 @@ bool cuttKernel(cuttPlan_t& plan, void* dataIn, void* dataOut) {
     case Packed:
     {
       switch(lc.numRegStorage) {
-#define CALL0(TYPE, NREG) \
-    transposePacked<TYPE, NREG> <<< lc.numblock, lc.numthread, lc.shmemsize, plan.stream >>> \
+#define CALL0(TYPE, NREG, betaIsZero) \
+    transposePacked<TYPE, NREG, betaIsZero> <<< lc.numblock, lc.numthread, lc.shmemsize, plan.stream >>> \
       (ts.volMmk, ts.volMbar, ts.sizeMmk, ts.sizeMbar, \
-      plan.Mmk, plan.Mbar, plan.Msh, (TYPE *)dataIn, (TYPE *)dataOut)
-#define CALL(ICASE) case ICASE: if (plan.sizeofType == 4) CALL0(float, ICASE); if (plan.sizeofType == 8) CALL0(double, ICASE); break
+      plan.Mmk, plan.Mbar, plan.Msh, (TYPE *)dataIn, (TYPE *)dataOut, alpha, beta)
+#define CALL(ICASE) case ICASE: \
+      if (plan.sizeofType == 4) \
+        if (beta == 0) \
+          CALL0(float, ICASE, true); \
+        else \
+          CALL0(float, ICASE, false); \
+      if (plan.sizeofType == 8) \
+        if (beta == 0) \
+          CALL0(double, ICASE, true); \
+        else \
+          CALL0(double, ICASE, false); \
+      break
 #include "calls.h"
       default:
       printf("cuttKernel no template implemented for numRegStorage %d\n", lc.numRegStorage);
@@ -872,11 +921,22 @@ bool cuttKernel(cuttPlan_t& plan, void* dataIn, void* dataOut) {
     case PackedSplit:
    {
       switch(lc.numRegStorage) {
-#define CALL0(TYPE, NREG) \
-    transposePackedSplit<TYPE, NREG> <<< lc.numblock, lc.numthread, lc.shmemsize, plan.stream >>> \
+#define CALL0(TYPE, NREG, betaIsZero) \
+    transposePackedSplit<TYPE, NREG, betaIsZero> <<< lc.numblock, lc.numthread, lc.shmemsize, plan.stream >>> \
       (ts.splitDim, ts.volMmkUnsplit, ts. volMbar, ts.sizeMmk, ts.sizeMbar, \
-      plan.cuDimMm, plan.cuDimMk, plan.Mmk, plan.Mbar, plan.Msh, (TYPE *)dataIn, (TYPE *)dataOut)
-#define CALL(ICASE) case ICASE: if (plan.sizeofType == 4) CALL0(float, ICASE); if (plan.sizeofType == 8) CALL0(double, ICASE); break
+      plan.cuDimMm, plan.cuDimMk, plan.Mmk, plan.Mbar, plan.Msh, (TYPE *)dataIn, (TYPE *)dataOut, alpha, beta)
+#define CALL(ICASE) case ICASE: \
+      if (plan.sizeofType == 4) \
+        if (beta == 0) \
+          CALL0(float, ICASE, true); \
+        else \
+          CALL0(float, ICASE, false); \
+      if (plan.sizeofType == 8) \
+        if (beta == 0) \
+          CALL0(double, ICASE, true); \
+        else \
+          CALL0(double, ICASE, false); \
+      break
 #include "calls.h"
       default:
       printf("cuttKernel no template implemented for numRegStorage %d\n", lc.numRegStorage);
@@ -890,24 +950,40 @@
     case Tiled:
     {
-#define CALL(TYPE) \
-      transposeTiled<TYPE> <<< lc.numblock, lc.numthread, 0, plan.stream >>> \
+#define CALL(TYPE, betaIsZero) \
+      transposeTiled<TYPE, betaIsZero> <<< lc.numblock, lc.numthread, 0, plan.stream >>> \
       (((ts.volMm - 1)/TILEDIM + 1), ts.volMbar, ts.sizeMbar, plan.tiledVol, plan.cuDimMk, plan.cuDimMm, \
-      plan.Mbar, (TYPE *)dataIn, (TYPE *)dataOut)
-      if (plan.sizeofType == 4) CALL(float);
-      if (plan.sizeofType == 8) CALL(double);
+      plan.Mbar, (TYPE *)dataIn, (TYPE *)dataOut, alpha, beta)
+      if (plan.sizeofType == 4)
+        if(beta == 0)
+          CALL(float, true);
+        else
+          CALL(float, false);
+      if (plan.sizeofType == 8)
+        if(beta == 0)
+          CALL(double, true);
+        else
+          CALL(double, false);
 #undef CALL
     }
     break;
 
     case TiledCopy:
     {
-#define CALL(TYPE) \
-      transposeTiledCopy<TYPE> <<< lc.numblock, lc.numthread, 0, plan.stream >>> \
+#define CALL(TYPE, betaIsZero) \
+      transposeTiledCopy<TYPE, betaIsZero> <<< lc.numblock, lc.numthread, 0, plan.stream >>> \
       (((ts.volMm - 1)/TILEDIM + 1), ts.volMbar, ts.sizeMbar, plan.cuDimMk, plan.cuDimMm, plan.tiledVol, \
-      plan.Mbar, (TYPE *)dataIn, (TYPE *)dataOut)
-      if (plan.sizeofType == 4) CALL(float);
-      if (plan.sizeofType == 8) CALL(double);
+      plan.Mbar, (TYPE *)dataIn, (TYPE *)dataOut, alpha, beta)
+      if (plan.sizeofType == 4)
+        if(beta == 0)
+          CALL(float, true);
+        else
+          CALL(float, false);
+      if (plan.sizeofType == 8)
+        if(beta == 0)
+          CALL(double, true);
+        else
+          CALL(double, false);
 #undef CALL
     }
     break;
diff --git a/src/cuttkernel.h b/src/cuttkernel.h
index 33a06fd..17667a9 100755
--- a/src/cuttkernel.h
+++ b/src/cuttkernel.h
@@ -31,6 +31,6 @@ void cuttKernelSetSharedMemConfig();
 int cuttKernelLaunchConfiguration(const int sizeofType, const TensorSplit& ts, const int deviceID,
   const cudaDeviceProp& prop, LaunchConfig& lc);
 
-bool cuttKernel(cuttPlan_t& plan, void* dataIn, void* dataOut);
+bool cuttKernel(cuttPlan_t& plan, void* dataIn, void* dataOut, void* alpha, void* beta);
 
 #endif // CUTTKERNEL_H