diff --git a/README.md b/README.md index 0e38ddb..5f12065 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,104 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Zhuohao Lin + * [LinkedIn](https://www.linkedin.com/in/zhuohao-lin-960b54194/) +* Tested on: Windows 10, i7-10875H @ 2.30GHz 16GB, NVIDIA GeForce RTX 2060 6GB (personal machine) + +# Overview +In this project, I implemented some widely used parallel algorithms such as work-efficient scan and stream compaction. I also implemented naive scan, CPU scan and CPU stream compaction for performance comparison. + +## Features +* CPU scan, CPU stream compaction with scan and without scan (for output and performance comparison) +* Naive scan in GPU: A simple algorithm for scan. +* Work-efficient scan in GPU: A more efficient algorithm for scan. I also made some optimizations to reduce the thread usage in GPU. +* Stream Compaction in GPU: An algorithm to shrink the size of the data and keep useful data in order. + +# Output +The output below is from a run with array size 2^23 (power-of-two arrays) and 2^23 - 3 (non-power-of-two arrays) +``` +**************** +** SCAN TESTS ** +**************** + [ 29 36 26 20 22 24 38 19 43 11 26 31 36 ... 43 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 11.7361ms (std::chrono Measured) + [ 0 29 65 91 111 133 157 195 214 257 268 294 325 ... 205475551 205475594 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 11.6701ms (std::chrono Measured) + [ 0 29 65 91 111 133 157 195 214 257 268 294 325 ... 
205475522 205475524 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 8.80413ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 8.74237ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 4.87853ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 3.78675ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.464672ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.476544ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 3 0 2 2 0 0 0 3 3 1 2 1 0 ... 3 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 18.0229ms (std::chrono Measured) + [ 3 2 2 3 3 1 2 1 3 2 1 1 2 ... 1 3 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 17.9753ms (std::chrono Measured) + [ 3 2 2 3 3 1 2 1 3 2 1 1 2 ... 2 1 ] + passed +==== cpu compact with scan ==== + elapsed time: 39.9532ms (std::chrono Measured) + [ 3 2 2 3 3 1 2 1 3 2 1 1 2 ... 1 3 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 5.8511ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 5.53261ms (CUDA Measured) + passed +``` + +# Performance Analysis + +## Optimal Block Size + +In order to find the optimal block size, I used the optimized work-efficient algorithm. I tested with different block sizes and measured the execution time. Shorter execution time means better performance. + +![](img/OptimalBlockSize1.PNG) +![](img/OptimalBlockSize2.PNG) + +According to the two graphs above, we can see that the performance improves as the block size increases. After reaching block size 128, increasing block size doesn't improve the performance anymore for either scan or stream compaction. Therefore, 128 is the optimal block size for both scan and stream compaction. 
+ +## Performance of Different Scan Algorithms +![](img/Scan1.PNG) +![](img/Scan2.PNG) + +The two graphs above show the performance of different implementations with different array sizes. With small array sizes, CPU scan has better performance. As the array size becomes large enough, all GPU scan methods outperform CPU scan. + +Among GPU methods, the Thrust implementation always has the best performance. The work-efficient method has poorer performance than the naive method on small arrays but significantly better performance on large arrays. + +
+It's worth mentioning that the work-efficient scan algorithm is optimized in terms of thread usage. Before this optimization, the work-efficient scan had even worse performance than the naive scan. This is because I used the % (modulo) operator, which is slow on the GPU. Moreover, before the optimization, most threads were simply waiting for a few threads to finish their computation. By calculating, within every thread, the index of the data which needs to be changed, I can free up most threads to make the scan more efficient. + +
+ +## Performance of Different Stream Compaction Algorithms +![](img/StreamCompaction.PNG) +From the graph, it's obvious that stream compaction on GPU using work-efficient scan has much better performance on large arrays than the CPU approaches. This is expected since stream compaction on GPU saves a lot of time with work-efficient scan. + -### (TODO: Your README) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) diff --git a/img/OptimalBlockSize1.PNG b/img/OptimalBlockSize1.PNG new file mode 100644 index 0000000..15c16e9 Binary files /dev/null and b/img/OptimalBlockSize1.PNG differ diff --git a/img/OptimalBlockSize2.PNG b/img/OptimalBlockSize2.PNG new file mode 100644 index 0000000..d00e619 Binary files /dev/null and b/img/OptimalBlockSize2.PNG differ diff --git a/img/Scan1.PNG b/img/Scan1.PNG new file mode 100644 index 0000000..0de5b5e Binary files /dev/null and b/img/Scan1.PNG differ diff --git a/img/Scan2.PNG b/img/Scan2.PNG new file mode 100644 index 0000000..162bb3e Binary files /dev/null and b/img/Scan2.PNG differ diff --git a/img/StreamCompaction.PNG b/img/StreamCompaction.PNG new file mode 100644 index 0000000..b715757 Binary files /dev/null and b/img/StreamCompaction.PNG differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..74bef51 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 19; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..84b1cb2 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -1,4 +1,5 @@ #include "common.h" +#include void checkCUDAErrorFn(const char *msg, const char *file, int line) { cudaError_t err = cudaGetLastError(); @@ 
-24,6 +25,10 @@ namespace StreamCompaction { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) return; + + bools[index] = idata[index] ? 1 : 0; } /** @@ -33,6 +38,13 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) return; + + if (bools[index]) + { + odata[indices[index]] = idata[index]; + } } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..8f666a8 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -20,6 +20,12 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + // Exclusive scan + odata[0] = 0; + for (int i = 1; i < n; ++i) + { + odata[i] = odata[i - 1] + idata[i - 1]; + } timer().endCpuTimer(); } @@ -31,8 +37,17 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int osize = 0; + for (int i = 0; i < n; ++i) + { + if (idata[i] != 0) + { + odata[osize] = idata[i]; + osize++; + } + } timer().endCpuTimer(); - return -1; + return osize; } /** @@ -41,10 +56,35 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { + int* tmp = new int[n]; + tmp[0] = 0; timer().startCpuTimer(); // TODO + // Check if corresponding elements meet criteria + for (int i = 0; i < n; ++i) + { + odata[i] = idata[i] ? 
1 : 0; + } + + // Run exclusive scan + tmp[0] = 0; + for (int i = 1; i < n; ++i) + { + tmp[i] = odata[i - 1] + tmp[i - 1]; + } + int size = tmp[n - 1]; + + // Scatter + for (int i = 0; i < n - 1; ++i) + { + if (tmp[i] != tmp[i + 1]) + { + odata[tmp[i]] = idata[i]; + } + } timer().endCpuTimer(); - return -1; + delete[] tmp; + return size; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..41cc267 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -2,6 +2,9 @@ #include #include "common.h" #include "efficient.h" +#include + +#define USE_OPTIMIZATION 1 namespace StreamCompaction { namespace Efficient { @@ -12,13 +15,98 @@ namespace StreamCompaction { return timer; } + __global__ void kernUpSweep(int n, int stride, int *odata) + { + int index = threadIdx.x + (blockIdx.x * blockDim.x); +#if USE_OPTIMIZATION + index = stride * index + stride - 1; +#endif + if (index >= n) return; + +#if USE_OPTIMIZATION + odata[index] += odata[index - stride / 2]; +#else + if ((index + 1) % stride == 0) + { + odata[index] += odata[index - stride / 2]; + } +#endif + + } + + __global__ void kernDownSweep(int n, int stride, int* odata) + { + int index = threadIdx.x + (blockIdx.x * blockDim.x); +#if USE_OPTIMIZATION + index = stride * index + stride - 1; +#endif + if (index >= n) return; + +#if USE_OPTIMIZATION + int leftChildVal = odata[index]; + odata[index] += odata[index - stride / 2]; + odata[index - stride / 2] = leftChildVal; +#else + if ((index + 1) % stride == 0) + { + int leftChildVal = odata[index]; + odata[index] += odata[index - stride / 2]; + odata[index - stride / 2] = leftChildVal; + } +#endif + + } + + void efficientScan(int n, int levelCount, int* dev_odata, int blockSize) + { + dim3 blockNum = (n + blockSize - 1) / blockSize; + int stride = 1; +#if USE_OPTIMIZATION + int sizeRequired = n; +#endif + // Up-Sweep + for (int d = 0; d < levelCount; ++d) + { +#if USE_OPTIMIZATION + sizeRequired /= 2; 
+ blockNum = (sizeRequired + blockSize - 1) / blockSize; +#endif + stride *= 2; + kernUpSweep<<>>(n, stride, dev_odata); + } + + // Down-Sweep + cudaMemset(dev_odata + n - 1, 0, sizeof(int)); + for (int d = levelCount - 1; d >= 0; --d) + { +#if USE_OPTIMIZATION + sizeRequired *= 2; + blockNum = (sizeRequired + blockSize - 1) / blockSize; +#endif + kernDownSweep<<>>(n, stride, dev_odata); + stride /= 2; + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int* dev_odata; + int levelCount = ilog2ceil(n); + int arraySize = 1 << levelCount; + cudaMalloc((void**)&dev_odata, arraySize * sizeof(int)); + cudaMemcpy(dev_odata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); // TODO + int blockSize = 128; + efficientScan(arraySize, levelCount, dev_odata, blockSize); + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_odata); } /** @@ -31,10 +119,40 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. 
*/ int compact(int n, int *odata, const int *idata) { + int *dev_idata; + int *dev_odata; + int *dev_bools; + int *dev_indices; + int levelCount = ilog2ceil(n); + int arraySize = 1 << levelCount; + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + cudaMalloc((void**)&dev_bools, n * sizeof(int)); + cudaMalloc((void**)&dev_indices, arraySize * sizeof(int)); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); // TODO + int blockSize = 128; + dim3 blockNum = (n + blockSize - 1) / blockSize; + Common::kernMapToBoolean<<>>(n, dev_bools, dev_idata); + + cudaMemcpy(dev_indices, dev_bools, n * sizeof(int), cudaMemcpyDeviceToDevice); + cudaMemset(dev_indices + n, 0, (arraySize - n) * sizeof(int)); + efficientScan(arraySize, levelCount, dev_indices, blockSize); + + Common::kernScatter<<>>(n, dev_odata, dev_idata, dev_bools, dev_indices); + timer().endGpuTimer(); - return -1; + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + int elementCount = 0; + cudaMemcpy(&elementCount, dev_indices + arraySize - 1, sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_bools); + cudaFree(dev_indices); + return elementCount; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..50ded00 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -2,6 +2,7 @@ #include #include "common.h" #include "naive.h" +#include namespace StreamCompaction { namespace Naive { @@ -12,14 +13,66 @@ namespace StreamCompaction { return timer; } // TODO: __global__ + __global__ void kernNaiveScan(int n, int stride, int *idata, int *odata) + { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) return; + + if (index >= stride) + { + odata[index] = idata[index - stride] + idata[index]; + } + else + { + odata[index] = idata[index]; + } + } + + __global__ void 
kernShiftRight(int n, int* idata, int* odata) + { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) return; + + if (index == 0) + { + odata[index] = 0; + } + else + { + odata[index] = idata[index - 1]; + } + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int *dev_idata; + int *dev_odata; + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); // TODO + int blockSize = 128; + dim3 blockNum = (n + blockSize - 1) / blockSize; + for (int stride = 1; stride < 2 * n; stride *= 2) + { + kernNaiveScan<<>>(n, stride, dev_idata, dev_odata); + std::swap(dev_idata, dev_odata); + + } + // Convert from inclusive scan to exclusive scan + // No need to swap buffers here since they're swapped in for loop. + kernShiftRight<<>>(n, dev_idata, dev_odata); + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); + cudaFree(dev_odata); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..f00769e 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,16 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + thrust::device_vector dv_in(idata, idata + n); + thrust::device_vector dv_out(n); + timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); timer().endGpuTimer(); + + thrust::copy(dv_out.begin(), dv_out.end(), odata); } } }