diff --git a/README.md b/README.md
index 0e38ddb..91a1a85 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,103 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Hanlin Sun
+  * [LinkedIn](https://www.linkedin.com/in/hanlin-sun-7162941a5/)
+  * [personal website](https://hanlinsun.github.io/)
+* Tested on: Windows 10, i7-8750H @ 3.2GHz 32GB, NVIDIA Quadro P3200
 
-### (TODO: Your README)
+# Stream Compaction
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+This project implements:
+
+* a CPU version of scan
+* a CPU version of compact without using scan
+* a CPU version of compact with scan
+* a GPU version of naive scan
+* a GPU version of work-efficient scan
+* a GPU version of stream compaction built on the work-efficient scan
+
+The three CPU implementations serve as references for checking that the GPU implementations are correct. I collected timing data across 8 executions with different array sizes. The program generates a new array of random values on each execution, and the size of the array is customizable; I varied it in powers of two from 2^8 up to 2^20. Each algorithm is also run on "non-power-of-two" arrays, which are generated by truncating the "power-of-two" arrays.
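+
+For readers new to these primitives, the following minimal CPU sketch (illustrative only, not the benchmarked implementation; the sample values are made up) shows what the two operations compute:
+
+```cpp
+#include <cstdio>
+
+int main() {
+    const int n = 8;
+    int in[n] = {1, 0, 4, 0, 2, 3, 0, 5};
+    int scanned[n], compacted[n];
+
+    // Exclusive scan: out[i] = sum of in[0..i-1].
+    scanned[0] = 0;
+    for (int i = 1; i < n; i++)
+        scanned[i] = scanned[i - 1] + in[i - 1];      // {0, 1, 1, 5, 5, 7, 10, 10}
+
+    // Stream compaction: keep the non-zero elements, preserving order.
+    int count = 0;
+    for (int i = 0; i < n; i++)
+        if (in[i] != 0) compacted[count++] = in[i];   // {1, 4, 2, 3, 5}
+
+    printf("kept %d of %d elements\n", count, n);
+    return 0;
+}
+```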
+
+# Output Results
+
+```
+****************
+** SCAN TESTS **
+****************
+    [ 21 28 22 23 38 18 20 9 44 26 14 10 3 ... 25 0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 0.0015ms    (std::chrono Measured)
+    [ 0 21 49 71 94 132 150 170 179 223 249 263 273 ... 6273 6298 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 0.0014ms    (std::chrono Measured)
+    [ 0 21 49 71 94 132 150 170 179 223 249 263 273 ... 6223 6226 ]
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 0.25088ms    (CUDA Measured)
+    [ 0 21 49 71 94 132 150 170 179 223 249 263 273 276 ... 6298 ]
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 0.23552ms    (CUDA Measured)
+    [ 0 21 49 71 94 132 150 170 179 223 249 263 273 276 ... 0 ]
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 0.185344ms    (CUDA Measured)
+    [ 0 21 49 71 94 132 150 170 179 223 249 263 273 ... 6273 6298 ]
+    passed
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 0.185344ms    (CUDA Measured)
+    [ 0 21 49 71 94 132 150 170 179 223 249 263 273 ... 6223 6226 ]
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 13.1092ms    (CUDA Measured)
+    [ 0 21 49 71 94 132 150 170 179 223 249 263 273 ... 6273 6298 ]
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 2.18214ms    (CUDA Measured)
+    [ 0 21 49 71 94 132 150 170 179 223 249 263 273 ... 6223 6226 ]
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [ 1 2 2 3 2 2 0 3 0 0 0 0 1 ... 3 0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 0.0014ms    (std::chrono Measured)
+    [ 1 2 2 3 2 2 3 1 1 3 2 1 1 ... 3 3 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 0.0013ms    (std::chrono Measured)
+    [ 1 2 2 3 2 2 3 1 1 3 2 1 1 ... 3 1 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 0.0038ms    (std::chrono Measured)
+    [ 1 2 2 3 2 2 3 1 1 3 2 1 1 ... 3 3 ]
+    passed
+==== work-efficient compact, power-of-two ====
+   elapsed time: 0.309248ms    (CUDA Measured)
+    [ 1 2 2 3 2 2 3 1 1 3 2 1 1 ... 3 3 ]
+    passed
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 0.274432ms    (CUDA Measured)
+    [ 1 2 2 3 2 2 3 1 1 3 2 1 1 ... 3 1 ]
+    passed
+```
+
+# Performance Analysis
+
+## Scanning Algorithm
+
+![Scan Chart](img/Analysis%201.JPG)
+
+In this chart, lower is better.
+When the arrays are small, the performance gap between the naive method and the work-efficient method is small, but as the array length grows, the gap becomes larger and larger.
+The naive scan keeps whole warps occupied even when most of their threads have nothing left to add: an idle thread still has to wait for the active threads in its warp to finish before its resources are freed.
+With the up-sweep/down-sweep approach, each up-sweep level needs only half as many threads as the previous one, so the GPU can reuse the freed resources for other work (such as the down-sweep). We launch a comparable number of threads overall, but the work-efficient scan performs O(n) additions in total versus O(n log n) for the naive scan, which is why it is significantly faster; a back-of-envelope comparison follows below.
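+
+Counting additions makes the gap concrete: the naive scan performs n - 2^(d-1) adds at each of its log2(n) levels, while the up-sweep and down-sweep together perform about 2(n - 1) adds (the standard analysis from GPU Gems 3, ch. 39). A small host-side sketch, separate from the benchmarked code, that prints both totals:
+
+```cpp
+#include <cstdio>
+
+int main() {
+    for (int logn = 8; logn <= 20; logn += 4) {
+        long long n = 1LL << logn;
+        long long naiveAdds = 0;
+        for (int d = 1; d <= logn; d++)
+            naiveAdds += n - (1LL << (d - 1));   // adds done by active threads at level d
+        long long efficientAdds = 2 * (n - 1);   // up-sweep + down-sweep combined
+        printf("n = 2^%-2d  naive: %lld adds  work-efficient: %lld adds\n",
+               logn, naiveAdds, efficientAdds);
+    }
+    return 0;
+}
+```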
+
+## Stream Compaction
+
+![Compact Chart](img/Analysis%202.JPG)
+
+In this chart, lower is better.
diff --git a/img/Analysis 1.JPG b/img/Analysis 1.JPG
new file mode 100644
index 0000000..de69e2a
Binary files /dev/null and b/img/Analysis 1.JPG differ
diff --git a/img/Analysis 2.JPG b/img/Analysis 2.JPG
new file mode 100644
index 0000000..cb38056
Binary files /dev/null and b/img/Analysis 2.JPG differ
diff --git a/src/main.cpp b/src/main.cpp
index 896ac2b..39b1bf2 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,7 +13,7 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 20; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
@@ -51,7 +51,7 @@ int main(int argc, char* argv[]) {
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
@@ -64,35 +64,35 @@ int main(int argc, char* argv[]) {
     printDesc("naive scan, non-power-of-two");
     StreamCompaction::Naive::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, power-of-two");
-    StreamCompaction::Efficient::scan(SIZE, c, a);
+    StreamCompaction::Efficient::scan(SIZE, c, a, false);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, non-power-of-two");
-    StreamCompaction::Efficient::scan(NPOT, c, a);
+    StreamCompaction::Efficient::scan(NPOT, c, a, false);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
+    printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
     printDesc("thrust scan, power-of-two");
     StreamCompaction::Thrust::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("thrust scan, non-power-of-two");
     StreamCompaction::Thrust::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
+    printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
     printf("\n");
@@ -137,14 +137,14 @@ int main(int argc, char* argv[]) {
     printDesc("work-efficient compact, power-of-two");
     count = StreamCompaction::Efficient::compact(SIZE, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
+    printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient compact, non-power-of-two");
     count = StreamCompaction::Efficient::compact(NPOT, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
+    printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
 
     system("pause"); // stop Win32 console from closing on exit
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 2ed6d63..91f4fef 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -22,8 +22,16 @@ namespace StreamCompaction {
          * Maps an array to an array of 0s and 1s for stream compaction. Elements
          * which map to 0 will be removed, and elements which map to 1 will be kept.
          */
-        __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
+        __global__ void kernMapToBoolean(int n, int *bools, const int *idata)
+        {
             // TODO
+            // Find this thread's global index; threads past the end do nothing.
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n)
+            {
+                return;
+            }
+            bools[index] = idata[index] != 0 ? 1 : 0;
         }
 
         /**
@@ -33,6 +41,16 @@ namespace StreamCompaction {
         __global__ void kernScatter(int n, int *odata, const int *idata,
                 const int *bools, const int *indices) {
             // TODO
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n)
+            {
+                return;
+            }
+            // Only kept elements are written, each to its scanned destination.
+            if (bools[index] != 0)
+            {
+                int targetIdx = indices[index];
+                odata[targetIdx] = idata[index];
+            }
         }
 
     }
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa11..e4e9817 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -15,11 +15,19 @@ namespace StreamCompaction {
         /**
          * CPU scan (prefix sum).
          * For performance analysis, this is supposed to be a simple for loop.
-         * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
+         * (Optional) For better understanding before moving to the GPU,
+         * you can simulate your GPU scan in this function first.
          */
         void scan(int n, int *odata, const int *idata) {
            timer().startCpuTimer();
            // TODO
+           // Exclusive scan: each output is the sum of all inputs before it.
+           odata[0] = 0;
+           for (int i = 1; i < n; i++)
+           {
+               odata[i] = odata[i - 1] + idata[i - 1];
+           }
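+           // Hand-checked example: idata = {3, 1, 7, 0, 4, 1, 6, 3}
+           // yields odata = {0, 3, 4, 11, 11, 15, 16, 22}; each output is
+           // the sum of every input strictly before it.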
            timer().endCpuTimer();
         }
 
@@ -28,11 +36,22 @@ namespace StreamCompaction {
          *
          * @returns the number of elements remaining after compaction.
          */
+        // Compaction keeps every non-zero element and discards the zeroes.
        int compactWithoutScan(int n, int *odata, const int *idata) {
            timer().startCpuTimer();
            // TODO
+           int j = 0;
+           for (int i = 0; i < n; i++)
+           {
+               if (idata[i] != 0)
+               {
+                   odata[j] = idata[i];
+                   j++;
+               }
+           }
            timer().endCpuTimer();
-            return -1;
+           return j;
        }
 
@@ -41,10 +60,42 @@ namespace StreamCompaction {
          *
          * @returns the number of elements remaining after compaction.
          */
        int compactWithScan(int n, int *odata, const int *idata) {
+           int* boolArray = new int[n];
+           int* scanArray = new int[n];
            timer().startCpuTimer();
            // TODO
+           // Map each element to 1 (keep) or 0 (discard).
+           for (int i = 0; i < n; i++)
+           {
+               boolArray[i] = (idata[i] != 0) ? 1 : 0;
+           }
+           // Inclusive scan of the boolean array...
+           scanArray[0] = boolArray[0];
+           for (int i = 1; i < n; i++)
+           {
+               scanArray[i] = scanArray[i - 1] + boolArray[i];
+           }
+           int elementNum = scanArray[n - 1];
+           // ...then shift right to make it exclusive.
+           for (int i = n - 1; i > 0; i--)
+           {
+               scanArray[i] = scanArray[i - 1];
+           }
+           scanArray[0] = 0;
+           // Scatter: each kept element lands at its scanned index.
+           for (int i = 0; i < n; i++)
+           {
+               if (boolArray[i] != 0)
+               {
+                   odata[scanArray[i]] = idata[i];
+               }
+           }
            timer().endCpuTimer();
-            return -1;
+           delete[] boolArray;
+           delete[] scanArray;
+           return elementNum;
        }
    }
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..10e9978 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -6,21 +6,116 @@ namespace StreamCompaction {
     namespace Efficient {
         using StreamCompaction::Common::PerformanceTimer;
+        using StreamCompaction::Common::kernMapToBoolean;
+        using StreamCompaction::Common::kernScatter;
         PerformanceTimer& timer()
         {
             static PerformanceTimer timer;
             return timer;
         }
 
+#define blockSize 256
+
+        __global__ void KernUpSweep(int n, int* data, int d)
+        {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n)
+                return;
+            // The stride between partial sums at depth d is 2^(d+1).
+            int pow1 = 1 << (d + 1);
+            int pow2 = 1 << d;
+            if (index % pow1 == 0)
+            {
+                data[index + pow1 - 1] += data[index + pow2 - 1];
+            }
+        }
+
+        __global__ void KernDownSweep(int n, int* data, int d)
+        {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n)
+                return;
+            int pow1 = 1 << (d + 1);
+            int pow2 = 1 << d;
+            if (index % pow1 == 0)
+            {
+                // Swap the left child into the right child, accumulating sums.
+                int t = data[index + pow2 - 1];
+                data[index + pow2 - 1] = data[index + pow1 - 1];
+                data[index + pow1 - 1] += t;
+            }
+        }
+
+        // Zero the root (last element) before the down-sweep.
+        __global__ void KernSetZero(int n, int* idata)
+        {
+            idata[n - 1] = 0;
+        }
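+
+        // Worked example of both sweeps on data = {1, 2, 3, 4}:
+        //   up-sweep    d=0: {1, 3, 3, 7}    d=1: {1, 3, 3, 10}  (root holds the total)
+        //   zero root:       {1, 3, 3, 0}
+        //   down-sweep  d=1: {1, 0, 3, 3}    d=0: {0, 1, 3, 6}   (the exclusive scan)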
 
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
-        void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
+        // timerStarted: pass true when the caller (compact below) has already
+        // started the GPU timer, so it is not started twice.
+        void scan(int n, int *odata, const int *idata, bool timerStarted) {
+            int* dev_data;
+
+            // The sweeps assume a power-of-two length, so round the buffer
+            // size up and zero-fill the tail.
+            int log2n = ilog2ceil(n);
+            int finalMemorySize = 1 << log2n;
+
+            dim3 fullBlocksPergrid((finalMemorySize + blockSize - 1) / blockSize);
+
+            // Allocate CUDA memory.
+            cudaMalloc((void**)&dev_data, finalMemorySize * sizeof(int));
+            checkCUDAError("cudaMalloc dev_data failed!");
+
+            cudaMemset(dev_data, 0, finalMemorySize * sizeof(int));
+            checkCUDAError("cudaMemset dev_data failed!");
+
+            // idata may be a host pointer (from main) or a device pointer
+            // (from compact), so let the runtime infer the copy direction.
+            cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyDefault);
+            checkCUDAError("cudaMemcpy dev_data failed!");
+
+            if (!timerStarted)
+            {
+                timer().startGpuTimer();
+            }
             // TODO
-            timer().endGpuTimer();
+            int real_d = ilog2ceil(finalMemorySize);
+            // Up-sweep: build partial sums bottom-up.
+            for (int d = 0; d <= real_d - 1; d++)
+            {
+                KernUpSweep<<<fullBlocksPergrid, blockSize>>>(finalMemorySize, dev_data, d);
+                checkCUDAError("KernUpSweep failed!");
+            }
+            // Down-sweep: zero the root, then propagate the sums back down.
+            KernSetZero<<<1, 1>>>(finalMemorySize, dev_data);
+            for (int d = real_d - 1; d >= 0; d--)
+            {
+                KernDownSweep<<<fullBlocksPergrid, blockSize>>>(finalMemorySize, dev_data, d);
+                checkCUDAError("KernDownSweep failed!");
+            }
+
+            if (!timerStarted)
+            {
+                timer().endGpuTimer();
+            }
+            cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDefault);
+            cudaFree(dev_data);
         }
 
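+        // compact() below chains map -> scan -> scatter. For
+        // idata = {1, 0, 2, 0, 3}:
+        //   booleans:       {1, 0, 1, 0, 1}
+        //   exclusive scan: {0, 1, 1, 2, 2}   (each kept element's slot)
+        //   scatter:        odata = {1, 2, 3}; count = bools[n-1] + scan[n-1] = 3
+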
         /**
          * Performs stream compaction on idata, storing the result into odata.
          * All zeroes are discarded.
@@ -31,10 +126,54 @@ namespace StreamCompaction {
          * @param n      The number of elements in idata.
          * @param odata  The array into which to store elements.
          * @param idata  The array of elements to compact.
          * @returns      The number of elements remaining after compaction.
          */
         int compact(int n, int *odata, const int *idata) {
+            dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+            // count[0] = last boolean, count[1] = last exclusive-scan value.
+            int count[2];
+
+            int* dev_idata;
+            int* dev_odata;
+            int* dev_bool;
+            int* dev_boolScan;
+
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            checkCUDAError("CUDA Malloc dev_idata failed!");
+            cudaMalloc((void**)&dev_bool, n * sizeof(int));
+            checkCUDAError("CUDA Malloc dev_bool failed!");
+            cudaMalloc((void**)&dev_boolScan, n * sizeof(int));
+            checkCUDAError("CUDA Malloc dev_boolScan failed!");
+            cudaMalloc((void**)&dev_odata, n * sizeof(int));
+            checkCUDAError("CUDA Malloc dev_odata failed!");
+
+            cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+
             timer().startGpuTimer();
-            // TODO
+            kernMapToBoolean<<<fullBlocksPerGrid, blockSize>>>(n, dev_bool, dev_idata);
+            checkCUDAError("kernMapToBoolean failed!");
+
+            scan(n, dev_boolScan, dev_bool, true);
+
+            kernScatter<<<fullBlocksPerGrid, blockSize>>>(n, dev_odata, dev_idata, dev_bool, dev_boolScan);
+            checkCUDAError("kernScatter failed!");
             timer().endGpuTimer();
-            return -1;
+
+            cudaMemcpy(count, dev_bool + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            cudaMemcpy(count + 1, dev_boolScan + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+
+            // The compacted size is the last boolean plus the last element
+            // of the boolean array's exclusive scan.
+            int compactedSize = count[0] + count[1];
+
+            cudaMemcpy(odata, dev_odata, sizeof(int) * compactedSize, cudaMemcpyDeviceToHost);
+            checkCUDAError("cudaMemcpy back failed!");
+
+            cudaFree(dev_idata);
+            cudaFree(dev_bool);
+            cudaFree(dev_boolScan);
+            cudaFree(dev_odata);
+
+            return compactedSize;
         }
     }
 }
diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h
index 803cb4f..18b0f55 100644
--- a/stream_compaction/efficient.h
+++ b/stream_compaction/efficient.h
@@ -6,7 +6,7 @@ namespace StreamCompaction {
     namespace Efficient {
         StreamCompaction::Common::PerformanceTimer& timer();
 
-        void scan(int n, int *odata, const int *idata);
+        void scan(int n, int *odata, const int *idata, bool timerStarted);
 
         int compact(int n, int *odata, const int *idata);
     }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..71e6504 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -13,13 +13,116 @@ namespace StreamCompaction {
         }
 
         // TODO: __global__
+        // Converts an inclusive scan to an exclusive one: shift every element
+        // one slot to the right and write the identity (0) up front.
+        __global__ void KernShiftToRight(int n, int* odata, int* idata)
+        {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n)
+            {
+                return;
+            }
+            if (index == 0)
+            {
+                odata[index] = 0;
+                return;
+            }
+            odata[index] = idata[index - 1];
+        }
+
+        // Copies the input into a padded power-of-two buffer and zero-fills
+        // the tail (despite the name, no shifting happens here).
+        __global__ void KernRightShiftAddZeros(int* odata, int* middle_buffer, int n, int difference)
+        {
+            int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (index >= n)
+            {
+                return;
+            }
+            if (index > (n - 1) - difference)
+            {
+                odata[index] = 0;
+                return;
+            }
+            odata[index] = middle_buffer[index];
+        }
+
+        __global__ void KernNaiveScan(int n, int d, int* odata, const int* idata)
+        {
+            // For all k in parallel.
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n)
+            {
+                return;
+            }
+            // At depth d, each element adds in the neighbor 2^(d-1)
+            // positions to its left (if any).
+            int d_offset = 1 << (d - 1);
+
+            int beginIndex = index - d_offset;
+            int prevData = beginIndex >= 0 ? idata[beginIndex] : 0;
+            odata[index] = idata[index] + prevData;
+        }
+
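+        // Worked example of the passes on idata = {1, 2, 3, 4}:
+        //   d = 1 (offset 1): {1, 3, 5, 7}
+        //   d = 2 (offset 2): {1, 3, 6, 10}   <- the inclusive scan
+        // KernShiftToRight then turns this into the exclusive scan {0, 1, 3, 6}.
+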
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            int blockSize = 256;
+
+            int* dev_idata;
+            int* dev_odata;
+            int* dev_middleBuffer;
+
+            // The scan runs over a power-of-two-sized buffer, so round the
+            // allocation up and remember how much padding was added.
+            int log2n = ilog2ceil(n);
+            int finalMemorySize = 1 << log2n;
+            int difference = finalMemorySize - n;
+
+            // Allocate padded device memory.
+            cudaMalloc((void**)&dev_idata, sizeof(int) * finalMemorySize);
+            checkCUDAError("cudaMalloc dev_idata failed!");
+            cudaMalloc((void**)&dev_odata, sizeof(int) * finalMemorySize);
+            checkCUDAError("cudaMalloc dev_odata failed!");
+            cudaMalloc((void**)&dev_middleBuffer, sizeof(int) * finalMemorySize);
+            checkCUDAError("cudaMalloc dev_middleBuffer failed!");
+
+            // Copy the input from host to device.
+            cudaMemcpy(dev_middleBuffer, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            cudaDeviceSynchronize();
+
+            dim3 BlocksPergrid((finalMemorySize + blockSize - 1) / blockSize);
+
             timer().startGpuTimer();
             // TODO
+            // Stage the input into the padded buffer, zero-filling the tail.
+            KernRightShiftAddZeros<<<BlocksPergrid, blockSize>>>(dev_idata, dev_middleBuffer, finalMemorySize, difference);
+            for (int d = 1; d <= log2n; d++)
+            {
+                KernNaiveScan<<<BlocksPergrid, blockSize>>>(finalMemorySize, d, dev_odata, dev_idata);
+                cudaDeviceSynchronize();
+                // Ping-pong the buffers: the latest result becomes the input.
+                int *dev_temp = dev_idata;
+                dev_idata = dev_odata;
+                dev_odata = dev_temp;
+            }
+            // The loop yields an inclusive scan; shift right to make it exclusive.
+            KernShiftToRight<<<BlocksPergrid, blockSize>>>(finalMemorySize, dev_odata, dev_idata);
+            cudaDeviceSynchronize();
             timer().endGpuTimer();
+
+            // Copy the exclusive result back to the host.
+            cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAError("cudaMemcpy odata failed!");
+            cudaFree(dev_idata);
+            cudaFree(dev_odata);
+            cudaFree(dev_middleBuffer);
         }
     }
 }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..e766c7e 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -17,12 +17,19 @@ namespace StreamCompaction {
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
-        void scan(int n, int *odata, const int *idata) {
+        void scan(int n, int* odata, const int* idata) {
+            // Stage the input in a host vector, then move it to the device.
+            thrust::host_vector<int> host_data(n);
+            for (int i = 0; i < n; i++)
+            {
+                host_data[i] = idata[i];
+            }
+
+            thrust::device_vector<int> dev_data = host_data;
             timer().startGpuTimer();
-            // TODO use `thrust::exclusive_scan`
-            // example: for device_vectors dv_in and dv_out:
-            //   thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            thrust::exclusive_scan(dev_data.begin(), dev_data.end(), dev_data.begin());
             timer().endGpuTimer();
+
+            thrust::copy(dev_data.begin(), dev_data.end(), odata);
         }
     }
 }