diff --git a/README.md b/README.md index 0e38ddb..ac899a7 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,81 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Yiyang Chen + * [LinkedIn](https://www.linkedin.com/in/yiyang-chen-6a7641210/), [personal website](https://cyy0915.github.io/) +* Tested on: Windows 10, i5-8700k @ 3.7GHz, GTX 1080, personal computer -### (TODO: Your README) +## Features +I implemented parts 1 to 5 -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +For part 5, I optimized it so that when the array size is large enough, efficient is always faster than naive, and naive is always faster than cpu. I did it by optimizing the number of threads in up sweep and down sweep +## Performance Analysis +I have roughly optimized the block sizes. I found that 128 is ok. + +![](img/scan.png) + +![](img/compact.png) + +See the graphs above, and note the axes + +* **Phenomena:** In the `scan` function, when the array length is large enough (>2^18), for runtime, cpu > naive > efficient > thrust. And in the `compact` function, also when the array length is large enough, for runtime, cpu > efficient. However, when the array length is small, cpu < naive < thrust < efficient. + +* **Explanation:** + * cpu: O(n) time complexity, as expected. I think the bottleneck is computation. + * naive: runs in parallel on the GPU, so it's faster than the cpu implementation when the array length is large. I think when the array length is small the bottleneck is memory I/O, so it's slower than cpu. + * efficient: runs in parallel on the GPU and doesn't need two buffers, so it's faster than naive when the array length is large. 
I think when the array length is small the bottleneck is also memory I/O, and it invoke more device functions compared to naive implementation, so it's even slower than naive. + + +```**************** +** SCAN TESTS ** +**************** + [ 23 47 43 2 9 23 22 34 41 24 17 1 6 ... 40 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 1.4016ms (std::chrono Measured) + [ 0 23 70 113 115 124 147 169 203 244 268 285 286 ... 25709761 25709801 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 1.4073ms (std::chrono Measured) + [ 0 23 70 113 115 124 147 169 203 244 268 285 286 ... 25709686 25709735 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.76112ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.763008ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.476032ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.479264ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.191008ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.213728ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 1 1 3 2 1 3 2 2 3 2 3 1 0 ... 2 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 2.1584ms (std::chrono Measured) + [ 1 1 3 2 1 3 2 2 3 2 3 1 3 ... 1 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 2.0913ms (std::chrono Measured) + [ 1 1 3 2 1 3 2 2 3 2 3 1 3 ... 3 3 ] + passed +==== cpu compact with scan ==== + elapsed time: 5.7503ms (std::chrono Measured) + [ 1 1 3 2 1 3 2 2 3 2 3 1 3 ... 
1 2 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.618016ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.636352ms (CUDA Measured) + passed``` \ No newline at end of file diff --git a/img/compact.png b/img/compact.png new file mode 100644 index 0000000..1dded4c Binary files /dev/null and b/img/compact.png differ diff --git a/img/scan.png b/img/scan.png new file mode 100644 index 0000000..298b95d Binary files /dev/null and b/img/scan.png differ diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..86dd09e 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -23,7 +23,10 @@ namespace StreamCompaction { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < n) { + bools[index] = idata[index] != 0; + } } /** @@ -32,7 +35,12 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < n) { + if (bools[index] == 1) { + odata[indices[index]] = idata[index]; + } + } } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..f86e526 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -19,7 +19,10 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + odata[0] = 0; + for (int i = 1; i < n; i++) { + odata[i] = odata[i - 1] + idata[i - 1]; + } timer().endCpuTimer(); } @@ -30,9 +33,15 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int oi = 0; + for (int i = 0; i < n; i++) { + if (idata[i] > 0) { + odata[oi] = idata[i]; + 
oi++; + } + } timer().endCpuTimer(); - return -1; + return oi; } /** @@ -42,9 +51,25 @@ namespace StreamCompaction { */ int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int* bools = new int[n], * indices = new int[n]; + for (int i = 0; i < n; i++) { + bools[i] = idata[i] != 0; + } + //scan + indices[0] = 0; + for (int i = 1; i < n; i++) { + indices[i] = indices[i - 1] + bools[i - 1]; + } + for (int i = 0; i < n; i++) { + if (idata[i] > 0) { + odata[indices[i]] = idata[i]; + } + } + int count = indices[n - 1] + bools[n - 1]; + delete[] bools; + delete[] indices; timer().endCpuTimer(); - return -1; + return count; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..0a4dc4d 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -12,15 +12,54 @@ namespace StreamCompaction { return timer; } + __global__ void kernUpSweep(int n, int t, int* data) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < n) { + data[t * index + t - 1] += data[t * index + (t >> 1) - 1]; + } + } + __global__ void kernDownSweep(int n, int t, int* data) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < n) { + int tmp = data[t * index + (t >> 1) - 1]; + data[t * index + (t >> 1) - 1] = data[t * index + t - 1]; + data[t * index + t - 1] += tmp; + } + } + __global__ void kernSetArray(int value, int index, int* data) { + data[index] = value; + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int blockSize = 128; + int* buffer; + int N = 1 << ilog2ceil(n); + cudaMalloc((void**)&buffer, N * sizeof(int)); + cudaMemcpy(buffer, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + for (int d = 0; d < ilog2ceil(n); d++) { + int computeCount = n >> d + 1; //up sweep is n, down sweep is N. 
It's not a typo + dim3 blockCount((computeCount + blockSize - 1) / blockSize); + kernUpSweep << <blockCount, blockSize >> > (computeCount, 1 << d + 1, buffer); //todo non power of 2 + } + kernSetArray << <1, 1 >> > (0, N - 1, buffer); + for (int d = ilog2ceil(n) - 1; d >= 0; d--) { + int computeCount = N >> d + 1; + dim3 blockCount((computeCount + blockSize - 1) / blockSize); + kernDownSweep << <blockCount, blockSize >> > (computeCount, 1 << d+1, buffer); + } timer().endGpuTimer(); + + cudaMemcpy(odata, buffer, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(buffer); } + /** * Performs stream compaction on idata, storing the result into odata. * All zeroes are discarded. @@ -31,10 +70,50 @@ * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { + int* bools, * indices, * ibuffer, * obuffer; + cudaMalloc((void**)&bools, n * sizeof(int)); + cudaMalloc((void**)&indices, n * sizeof(int)); + cudaMalloc((void**)&ibuffer, n * sizeof(int)); + cudaMalloc((void**)&obuffer, n * sizeof(int)); + cudaMemcpy(ibuffer, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + int blockSize = 128; + dim3 blockCount((n + blockSize - 1) / blockSize); + timer().startGpuTimer(); - // TODO + Common::kernMapToBoolean<<<blockCount, blockSize>>>(n, bools, ibuffer); + + //------------------ scan --------------------- + int N = 1 << ilog2ceil(n); + cudaMemcpy(indices, bools, n * sizeof(int), cudaMemcpyDeviceToDevice); + + for (int d = 0; d < ilog2ceil(n); d++) { + int computeCount = n >> d + 1; //up sweep is n, down sweep is N. 
It's not a typo + dim3 blockCount((computeCount + blockSize - 1) / blockSize); + kernUpSweep << <blockCount, blockSize >> > (computeCount, 1 << d + 1, indices); //todo non power of 2 + } + kernSetArray << <1, 1 >> > (0, N - 1, indices); + for (int d = ilog2ceil(n) - 1; d >= 0; d--) { + int computeCount = N >> d + 1; + dim3 blockCount((computeCount + blockSize - 1) / blockSize); + kernDownSweep << <blockCount, blockSize >> > (computeCount, 1 << d + 1, indices); + } + //------------------ scan --------------------- + + Common::kernScatter << <blockCount, blockSize >> > (n, obuffer, ibuffer, bools, indices); timer().endGpuTimer(); - return -1; + + int count, bias; + cudaMemcpy(&count, indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&bias, bools + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + int remainNum = count + bias; + cudaMemcpy(odata, obuffer, remainNum * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(bools); + cudaFree(indices); + cudaFree(ibuffer); + cudaFree(obuffer); + return remainNum; //todo } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..f4ca8b0 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -12,14 +12,42 @@ return timer; } // TODO: __global__ + __global__ void kernScan(int n, int bias, int* odata, const int* idata) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= bias && index < n) { + odata[index] = idata[index - bias] + idata[index]; + } + else { + odata[index] = idata[index]; + } + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ - void scan(int n, int *odata, const int *idata) { + void scan(int n, int* odata, const int* idata) { + int blockSize = 128; + dim3 blockCount((n + blockSize - 1) / blockSize); + + int* buffer1, *buffer2; + cudaMalloc((void**)&buffer1, n * sizeof(int)); + cudaMalloc((void**)&buffer2, n * sizeof(int)); + cudaMemcpy(buffer1, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + int in = ilog2ceil(n); + for (int d = 1; d <= in; d++) { + int bias = 1 << d - 1; + kernScan<<<blockCount, blockSize>>>(n, bias, buffer2, buffer1); + + std::swap(buffer1, buffer2); + } timer().endGpuTimer(); + + cudaMemcpy(odata + 1, buffer1, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost); + odata[0] = 0; + cudaFree(buffer1); + cudaFree(buffer2); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..23f89b3 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,19 @@ * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + thrust::device_vector<int> thrust_idata(idata, idata + n); + thrust::device_vector<int> thrust_odata(n); + timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + thrust::exclusive_scan(thrust_idata.begin(), thrust_idata.end(), thrust_odata.begin()); + timer().endGpuTimer(); + + thrust::copy(thrust_odata.begin(), thrust_odata.end(), odata); } } }