diff --git a/README.md b/README.md index 0e38ddb..23de218 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,23 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) -### (TODO: Your README) +* Eyad Almoamen + * [LinkedIn](https://www.linkedin.com/in/eyadalmoamen/), [personal website](https://eyadnabeel.com) +* Tested on: Windows 11, i7-10750H CPU @ 2.60GHz 2.59 GHz 16GB, RTX 2070 Super Max-Q Design 8GB (Personal Computer) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +Introduction +====================== +I implemented exclusive scan on the CPU and on the GPU using both the naive and work-efficient methods. I've also implemented stream compaction + +Analysis +====================== +**Effect of Block Size on performance** +I ran the algorithms with variation in block size on arrays of size n = 2^14 elements, and the following graph shows the results: + +![](img/blocksize.png) + +There doesn't seem to be any sort of conclusive relation between blocksize and performance. 
+**Effect of number of elements on performance** +(I ran into a bug which rendered the algorithm incapable of running on arrays larger than 2^14, and therefore was not able to produce any meaningful results especially in comparison with the cpu) \ No newline at end of file diff --git a/img/blocksize.png b/img/blocksize.png new file mode 100644 index 0000000..f7d3167 Binary files /dev/null and b/img/blocksize.png differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..3c4d2ff 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 18; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -51,7 +51,7 @@ int main(int argc, char* argv[]) { printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan @@ -64,21 +64,21 @@ int main(int argc, char* argv[]) { printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); 
StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); @@ -137,14 +137,14 @@ int main(int argc, char* argv[]) { printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); system("pause"); // stop Win32 console from closing on exit diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..4d369dd 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -1,4 +1,5 @@ #include "common.h" +#include void checkCUDAErrorFn(const char *msg, const char *file, int line) { cudaError_t err = cudaGetLastError(); @@ -24,6 +25,17 @@ namespace StreamCompaction { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + if (idx >= n) { + return; + } + + if (idata[idx] > 0) { + bools[idx] = 1; + } + else { + bools[idx] = 0; + } } /** @@ -33,6 +45,14 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + if (idx >= n) { + return; + } + + if (bools[idx] == 1) { + odata[indices[idx]] = 
idata[idx]; + } } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed..9d1cfca 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -10,6 +10,8 @@ #include #include +#define blockSize 512 + #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..372d212 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -20,6 +20,13 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int sum = 0; + odata[0] = 0; + sum += idata[0]; + for (int i = 1; i < n; i++) { + odata[i] = sum; + sum += idata[i]; + } timer().endCpuTimer(); } @@ -31,8 +38,16 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int count = 0; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[count] = idata[i]; + count++; + } + } + timer().endCpuTimer(); - return -1; + return count; } /** @@ -43,8 +58,35 @@ namespace StreamCompaction { int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int* bools = (int *)malloc(n * sizeof(int)); + int* indices = (int*)malloc(n * sizeof(int)); + + for (int i = 0; i < n; i++) { + bools[i] = (idata[i] != 0) ? 
1 : 0; + } + int sum = 0; + indices[0] = 0; + sum += bools[0]; + for (int i = 1; i < n; i++) { + indices[i] = sum; + sum += bools[i]; + } + memcpy(odata, indices, n * sizeof(int)); + int count = indices[n - 1]; + + for (int i = 0; i < n; i++) { + if (bools[i] == 1) { + odata[indices[i]] = idata[i]; + } + } + + free(bools); + free(indices); + + + timer().endCpuTimer(); - return -1; + return count; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..e445911 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -2,6 +2,9 @@ #include #include "common.h" #include "efficient.h" +#include +#include + namespace StreamCompaction { namespace Efficient { @@ -15,10 +18,93 @@ /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ + + __global__ void kernUpsweepStep(int n, int destStride, int srcStride, int *data) { + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + int actualIdx = (idx + 1) * destStride - 1; + if (actualIdx >= n) { + return; + } + data[actualIdx] += data[actualIdx - srcStride]; + } + + __global__ void kernDownsweepStep(int n, int destStride, int srcStride, int* data) { + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + int actualIdx = (idx + 1) * destStride - 1; + if (actualIdx >= n) { + return; + } + int temp = data[actualIdx - srcStride]; + data[actualIdx - srcStride] = data[actualIdx]; + data[actualIdx] += temp; + } + + void scanWithoutTimer(int n, dim3 blocksPerGrid, int* dev_data) { + // TODO + + for (int d = 0; d <= ilog2ceil(n); d++) { + kernUpsweepStep <<<blocksPerGrid, blockSize>>> (n, std::pow(2, d + 1), std::pow(2, d), dev_data); + cudaDeviceSynchronize(); + } + + int zero = 0; + cudaMemcpy(dev_data + n - 1, &zero, sizeof(int), cudaMemcpyHostToDevice); + + for (int d = ilog2ceil(n); d >= 0; d--) { + kernDownsweepStep <<<blocksPerGrid, blockSize>>> (n, std::pow(2, d + 1), std::pow(2, d), dev_data); + cudaDeviceSynchronize(); + } + } + + int closestPower(int num) 
{ + int i = 0; + while (num > std::pow(2, i)) { + i++; + } + return std::pow(2, i); + } + + int* zeros(int num) { + int *arr = (int*)malloc(num * sizeof(int)); + for (int i = 0; i < num; i++) { + arr[i] = 0; + } + return arr; + } + void scan(int n, int *odata, const int *idata) { + int nPot = closestPower(n); + + dim3 fullBlocksPerGrid((nPot + blockSize - 1) / blockSize); + + int* dev_data; + + cudaMalloc((void**)&dev_data, nPot * sizeof(int)); + checkCUDAError("Error during cudaMalloc dev_data"); + + cudaMemcpy(dev_data + nPot - n, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("Error during cudaMemcpy idata ==> dev_data"); + + int* zero = zeros(n); + + cudaMemcpy(dev_data, zero, (nPot - n) * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("Error during cudaMemcpy zero ==> dev_data"); + + cudaDeviceSynchronize(); + timer().startGpuTimer(); - // TODO + + scanWithoutTimer(nPot, fullBlocksPerGrid, dev_data); + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_data + nPot - n, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("Error during cudaMemcpy odata"); + + cudaFree(dev_data); + checkCUDAError("Error during cudaFree dev_data"); + + free(zero); } /** @@ -31,10 +117,71 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. 
*/ int compact(int n, int *odata, const int *idata) { + int nPot = closestPower(n); + + dim3 fullBlocksPerGrid((nPot + blockSize - 1) / blockSize); + + int* dev_idata, * dev_bools, * dev_indices, * dev_odata; + + cudaMalloc((void**)&dev_idata, nPot * sizeof(int)); + checkCUDAError("Error during cudaMalloc dev_idata"); + + cudaMalloc((void**)&dev_bools, nPot * sizeof(int)); + checkCUDAError("Error during cudaMalloc dev_bools"); + + cudaMalloc((void**)&dev_indices, nPot * sizeof(int)); + checkCUDAError("Error during cudaMalloc dev_indices"); + + cudaMalloc((void**)&dev_odata, nPot * sizeof(int)); + checkCUDAError("Error during cudaMalloc dev_odata"); + + cudaMemcpy(dev_idata + nPot - n, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("Error during cudaMemcpy dev_data"); + + int* zero = zeros(n); + + cudaMemcpy(dev_idata, zero, (nPot - n) * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("Error during cudaMemcpy zero ==> dev_data"); + + cudaDeviceSynchronize(); + timer().startGpuTimer(); - // TODO + //// TODO + // + + StreamCompaction::Common::kernMapToBoolean <<<fullBlocksPerGrid, blockSize>>> (nPot, dev_bools, dev_idata); + + cudaMemcpy(dev_indices, dev_bools, nPot * sizeof(int), cudaMemcpyDeviceToDevice); + checkCUDAError("Error during cudaMemcpy dev_data"); + + scanWithoutTimer(nPot, fullBlocksPerGrid, dev_indices); + StreamCompaction::Common::kernScatter <<<fullBlocksPerGrid, blockSize>>> (n, dev_odata + nPot - n, dev_idata + nPot - n, dev_bools + nPot - n, dev_indices + nPot - n); + timer().endGpuTimer(); - return -1; + + cudaMemcpy(odata, dev_odata + nPot - n, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("Error during cudaMemcpy dev_odata"); + + int count = 0; + int lastbool = 0; + cudaMemcpy(&count, dev_indices + nPot - 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&lastbool, dev_bools + nPot - 1, sizeof(int), cudaMemcpyDeviceToHost); + + count += lastbool; + + cudaFree(dev_odata); + checkCUDAError("Error during cudaFree dev_odata"); + + cudaFree(dev_indices); + 
checkCUDAError("Error during cudaFree dev_indices"); + + cudaFree(dev_bools); + checkCUDAError("Error during cudaFree dev_bools"); + + cudaFree(dev_idata); + checkCUDAError("Error during cudaFree dev_idata"); + + return count; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..2ec0f1d 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -1,7 +1,11 @@ #include #include +#include #include "common.h" #include "naive.h" +#include +#include +#include namespace StreamCompaction { namespace Naive { @@ -12,14 +16,63 @@ return timer; } // TODO: __global__ - + + __global__ void kernScanStep(int n, int stride, int* idata, int* odata) { + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + if (idx >= n) { + return; + } + odata[idx] = idata[idx]; + if (idx < stride) { + return; + } + odata[idx] += idata[idx - stride]; + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + int* dev_idata, * dev_odata; + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + checkCUDAError("Error during cudaMalloc dev_idata"); + + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAError("Error during cudaMalloc dev_odata"); + + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("Error during cudaMemcpy dev_idata"); + + cudaDeviceSynchronize(); + // TODO + + timer().startGpuTimer(); + + for (int d = 0; d <= ilog2ceil(n); d++) { + kernScanStep <<<fullBlocksPerGrid, blockSize>>> (n, std::pow(2, d), dev_idata, dev_odata); + cudaDeviceSynchronize(); + + cudaMemcpy(dev_idata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToDevice); + checkCUDAError("Error during cudaMemcpy dev_odata ==> dev_idata"); + } + timer().endGpuTimer(); + + cudaMemcpy(odata + 1, dev_odata, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost); + 
checkCUDAError("Error during cudaMemcpy odata"); + + cudaFree(dev_idata); + checkCUDAError("Error during cudaFree dev_idata"); + cudaFree(dev_odata); + checkCUDAError("Error during cudaFree dev_odata"); + + + cudaDeviceSynchronize(); } + } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..0d34998 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,37 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + + int* dev_idata, * dev_odata; + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + checkCUDAError("Error during cudaMalloc dev_idata"); + + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAError("Error during cudaMalloc dev_odata"); + + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("Error during cudaMemcpy idata --> dev_idata"); + + thrust::device_ptr<int> dev_thrust_idata = thrust::device_ptr<int>(dev_idata); + thrust::device_ptr<int> dev_thrust_odata = thrust::device_ptr<int>(dev_odata); + timer().startGpuTimer(); + + + thrust::exclusive_scan(dev_thrust_idata, dev_thrust_idata + n, dev_thrust_odata); + + // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("Error during cudaMemcpy dev_odata --> odata"); + + cudaFree(dev_idata); + cudaFree(dev_odata); } } }