diff --git a/README.md b/README.md
index 0e38ddb..ff3ce43 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,296 @@
-CUDA Stream Compaction
+University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2 - CUDA Stream Compaction
 ======================
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
+* Di Lu
 * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Tested on: Windows 11, i7-12700H @ 2.30GHz 32GB, NVIDIA GeForce RTX 3050 Ti
+
+## Introduction
+
+In this project, I implemented the following algorithms on the GPU and tested them:
+
+1. Exclusive Scan (CPU, Naive Scan, Work-Efficient Scan, Thrust Scan) - given an array A, output another array B such that each element b\[i\]
+is the sum a\[0\] + ... + a\[i - 1\], excluding a\[i\] itself.
+2. Stream Compaction - given an array A, output another array B that contains only the elements of A which meet a criterion.
+
+## Implementation and Results
+#### CPU: Sequential Scan
+
+* On the CPU, a sequential scan is a simple for-loop over the input array A that
+writes B[i] = B[i - 1] + A[i - 1], with B[0] = 0.
+
+#### CPU: Stream Compact without Scan
+
+* Stream compaction without scan simply keeps a counter of the current write index into the output array B
+while iterating through input array A. If A[i] != 0, then B[output_index] = A[i] and the counter advances.
+
+#### CPU: Stream Compact with Scan
+
+* Stream compaction with scan is a sequential implementation of the GPU stream compaction algorithm, except on the CPU,
+so none of the advantages of parallel execution come into play. See the GPU stream
+compaction section below for the algorithm itself.
+
+#### GPU: Naive GPU Scan
+
+* A naive GPU scan takes an input array (or read array) and, for each element, adds in the element one offset
+to its left, writing into an output array (or write array). It then ping-pongs the arrays (read becomes write and
+vice versa), doubles the additive offset, and repeats until the offset reaches the array length, at which point
+every element holds its inclusive prefix sum. Each addition within a level can be done in parallel, since no two
+threads ever write to the same array slot. The only caveat is that each level must finish its reads and writes
+before the next level begins.
+
+#### GPU: Work-Efficient GPU Scan
+
+* A work-efficient exclusive GPU scan uses only one array and does the operation in place. The array is treated as a balanced
+binary tree in which the left child of each node retains its original value from the read array. Keeping those
+original values around lets us recover the entire scan array during the down sweep.
+
+1. Up Sweep:
+For each consecutive pair of elements, add them and store the sum in the right-hand partner's slot, overwriting it.
+Repeat with a doubled stride until only two partners remain, whose single resulting sum is written to the last array slot.
+The array now alternates between "original" and "partial sum" values.
+
+2. Down Sweep:
+Set the last element of the array to 0; this is the root node. Walking back down the tree, each node passes its own
+value to its left child and writes the sum of its value and the left child's old value (i.e., the sibling's value)
+into the right child. A worked example follows the Thrust section below.
+
+#### GPU: Thrust Scan
+
+* This scan is fairly simple to implement, as `thrust::exclusive_scan` is a built-in function of the Thrust library.
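+
+To make the up sweep and down sweep concrete, here is a worked trace on a small
+hypothetical 8-element input (the example used in GPU Gems 3), producing an
+exclusive scan:
+
+```
+input:        [3 1 7 0 4 1 6 3]
+up sweep:     [3 4 7 7 4 5 6 9]        offset 2: each right partner takes the pair sum
+              [3 4 7 11 4 5 6 14]      offset 4
+              [3 4 7 11 4 5 6 25]      offset 8: the last slot now holds the total
+zero root:    [3 4 7 11 4 5 6 0]
+down sweep:   [3 4 7 0 4 5 6 11]       offset 8: left child gets the node value, right child gets node + old left
+              [3 0 7 4 4 11 6 16]      offset 4
+              [0 3 4 11 11 15 16 22]   offset 2: the exclusive scan of the input
+```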
+
+#### GPU: Stream Compaction
+
+* On the GPU, stream compaction consists of the following steps (a condensed kernel
+sketch appears at the end of this README):
+
+1. For each element in A, compute a "boolean" array b such that: b[i] = 1 if A[i] meets the criterion, and b[i] = 0 otherwise.
+2. Run exclusive scan on the boolean array b to produce an index array s.
+3. Scatter: if b[i] = 1, write A[i] to index s[i] of the output array.
+
+## Output Example
+
+The following is an example of the output for N = 2^20:
+```
+****************
+** SCAN TESTS **
+****************
+    [ 18 19 38 39 17 17 11 34 0 18 47 37 37 ... 2 0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 2.1343ms (std::chrono Measured)
+    [ 0 18 37 75 114 131 148 159 193 193 211 258 295 ... 25679509 25679511 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 2.0678ms (std::chrono Measured)
+    [ 0 18 37 75 114 131 148 159 193 193 211 258 295 ... 25679459 25679462 ]
+    passed
+==== naive scan, power-of-two ====
+blockSize: 256
+   elapsed time: 1.40179ms (CUDA Measured)
+    [ 0 18 37 75 114 131 148 159 193 193 211 258 295 ... 25679509 25679511 ]
+    passed
+==== naive scan, non-power-of-two ====
+blockSize: 256
+   elapsed time: 1.40688ms (CUDA Measured)
+    [ 0 18 37 75 114 131 148 159 193 193 211 258 295 ... 0 0 ]
+    passed
+==== work-efficient scan, power-of-two ====
+blockSize: 256
+   elapsed time: 1.63549ms (CUDA Measured)
+    [ 0 18 37 75 114 131 148 159 193 193 211 258 295 ... 25679509 25679511 ]
+    passed
+==== work-efficient scan, non-power-of-two ====
+blockSize: 256
+   elapsed time: 1.6831ms (CUDA Measured)
+    [ 0 18 37 75 114 131 148 159 193 193 211 258 295 ... 25679459 25679462 ]
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 0.404608ms (CUDA Measured)
+    [ 0 18 37 75 114 131 148 159 193 193 211 258 295 ... 25679509 25679511 ]
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 0.3792ms (CUDA Measured)
+    [ 0 18 37 75 114 131 148 159 193 193 211 258 295 ... 25679459 25679462 ]
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [ 0 1 2 3 1 1 1 2 0 0 3 3 3 ... 0 0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 2.3575ms (std::chrono Measured)
+    [ 1 2 3 1 1 1 2 3 3 3 1 2 2 ... 1 1 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 2.4421ms (std::chrono Measured)
+    [ 1 2 3 1 1 1 2 3 3 3 1 2 2 ... 1 1 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 6.5362ms (std::chrono Measured)
+    [ 1 2 3 1 1 1 2 3 3 3 1 2 2 ... 1 1 ]
+    passed
+==== work-efficient compact, power-of-two ====
+blockSize: 256
+   elapsed time: 3.06957ms (CUDA Measured)
+    [ 1 2 3 1 1 1 2 3 3 3 1 2 2 ... 1 1 ]
+    passed
+==== work-efficient compact, non-power-of-two ====
+blockSize: 256
+   elapsed time: 2.88675ms (CUDA Measured)
+    [ 1 2 3 1 1 1 2 3 3 3 1 2 2 ... 1 1 ]
+    passed
+```
+
+## Testing Methods
+
+To test my program, each run generates a random array of numbers within a given
+range. I varied the array length from 4 up to 2^25, and tested arrays both with
+and without a 0 as the final element.
+
+## Performance Analysis
+
+* Roughly optimize the block sizes of each of your implementations for minimal
+  run time on your GPU.
+  * In the following performance analysis, I used an array size of N = 2^25.
+
+  ![](img/blocksizeTable.png)
+
+  _Table. 1 Blocksize vs. Results for each algorithm_
+
+  ![](img/blocksize.png)
+
+  _Fig. 1 Blocksize vs. Results for each algorithm_
+
+* Compare all of these GPU Scan implementations (Naive, Work-Efficient, and
+  Thrust) to the serial CPU version of Scan. Plot a graph of the comparison
+  (with array size on the independent axis).
+  #### Scan
+  ![](img/scanTable.png)
+
+  _Table. 2 Scan Algorithm: Number of Elements vs. Results_
+
+  ![](img/scanpow2.png)
+
+  _Fig. 2 Scan Algorithm: Number of Elements vs. Results, Power of 2_
+
+  ![](img/scannotpow2.png)
+
+  _Fig. 3 Scan Algorithm: Number of Elements vs. Results, NON Power of 2_
+
+  In Scan, the performance results for power-of-2 and non-power-of-2 arrays are fairly similar.
+  As expected, CPU scan is the least efficient algorithm once there are many elements. Naive scan is slightly
+  more efficient, although not by much. The work-efficient GPU scan does run faster than the CPU and naive versions,
+  but the effect only shows once the input array grows beyond roughly 1.5 million elements. Thrust scan is the
+  fastest of them all.
+
+* Write a brief explanation of the phenomena you see here.
+  * Can you find the performance bottlenecks? Is it memory I/O? Computation? Is
+    it different for each implementation?
+
+I am not surprised that the work-efficient GPU implementation isn't faster until the input array gets
+significantly large. This is because I didn't optimize my implementations to ensure that threads are not idling:
+during scan I only use every other thread, then every fourth thread, and so on. A way around this is to
+launch fewer threads per level and compute each thread's destination slot from its thread index and the current
+up-sweep or down-sweep depth.
+
+Additionally, I ensured that my GPU timers included as few cudaMalloc, cudaMemcpy, and cudaMemset calls as
+possible, but it's not always feasible to move all of them outside of the "main" algorithm code. For example,
+compaction needs a cudaMalloc for an array whose length is only known after a value is computed mid-algorithm,
+so that allocation has to stay inside the timed region.
+
+  * To guess at what might be happening inside the Thrust implementation (e.g.
+    allocation, memory copy), take a look at the Nsight timeline for its
+    execution. Your analysis here doesn't have to be detailed, since you aren't
+    even looking at the code for the implementation.
+
+  #### Stream Compaction
+  ![](img/streamTable.png)
+
+  _Table. 3 Stream Compaction Algorithm: Number of Elements vs. Results_
+
+  ![](img/streampow2.png)
+
+  _Fig. 4 Stream Compaction Algorithm: Number of Elements vs. Results, Power of 2_
+
+  ![](img/streamnotpow2.png)
+
+  _Fig. 5 Stream Compaction Algorithm: Number of Elements vs. Results, NON Power of 2_
+
+* Paste the output of the test program into a triple-backtick block in your
+  README.
+  * See above
+
+## NSight Performance Confusion
+
+When running my program in Nsight, the reported timings for N = 2^25 are much faster than in
+Release mode in Visual Studio. I'm not exactly sure why.
+
+```
+****************
+** SCAN TESTS **
+****************
+    [ 17 20 19 7 22 46 11 3 35 15 31 39 34 ... 20 0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 2.0798ms (std::chrono Measured)
+    [ 0 17 37 56 63 85 131 142 145 180 195 226 265 ... 25666495 25666515 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 2.0982ms (std::chrono Measured)
+    [ 0 17 37 56 63 85 131 142 145 180 195 226 265 ... 25666419 25666467 ]
+    passed
+==== naive scan, power-of-two ====
+blockSize: 256
+   elapsed time: 1.716ms (CUDA Measured)
+    [ 0 17 37 56 63 85 131 142 145 180 195 226 265 ... 25666495 25666515 ]
+    passed
+==== naive scan, non-power-of-two ====
+blockSize: 256
+   elapsed time: 2.00374ms (CUDA Measured)
+    [ 0 17 37 56 63 85 131 142 145 180 195 226 265 ... 0 0 ]
+    passed
+==== work-efficient scan, power-of-two ====
+blockSize: 256
+   elapsed time: 2.3617ms (CUDA Measured)
+    [ 0 17 37 56 63 85 131 142 145 180 195 226 265 ... 25666495 25666515 ]
+    passed
+==== work-efficient scan, non-power-of-two ====
+blockSize: 256
+   elapsed time: 2.23104ms (CUDA Measured)
+    [ 0 17 37 56 63 85 131 142 145 180 195 226 265 ... 25666419 25666467 ]
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 0.567744ms (CUDA Measured)
+    [ 0 17 37 56 63 85 131 142 145 180 195 226 265 ... 25666495 25666515 ]
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 0.566816ms (CUDA Measured)
+    [ 0 17 37 56 63 85 131 142 145 180 195 226 265 ... 25666419 25666467 ]
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [ 3 2 3 1 2 0 1 3 3 3 1 1 0 ... 0 0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 2.3101ms (std::chrono Measured)
+    [ 3 2 3 1 2 1 3 3 3 1 1 3 1 ... 2 2 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 2.2745ms (std::chrono Measured)
+    [ 3 2 3 1 2 1 3 3 3 1 1 3 1 ... 2 2 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 6.392ms (std::chrono Measured)
+    [ 3 2 3 1 2 1 3 3 3 1 1 3 1 ... 2 2 ]
+    passed
+==== work-efficient compact, power-of-two ====
+blockSize: 256
+   elapsed time: 4.70064ms (CUDA Measured)
+    [ 3 2 3 1 2 1 3 3 3 1 1 3 1 ... 2 2 ]
+    passed
+==== work-efficient compact, non-power-of-two ====
+blockSize: 256
+   elapsed time: 3.8247ms (CUDA Measured)
+```
+
-### (TODO: Your README)
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
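+
+## Appendix: Compaction Kernel Sketch
+
+As referenced in the GPU stream compaction section above, here is a condensed,
+hypothetical sketch of the three compaction steps (any exclusive scan can supply
+step 2; the real implementations live in `stream_compaction/efficient.cu`):
+
+```
+// step 1: map each element to 1 (keep) or 0 (discard)
+__global__ void kernMapToBoolean(int n, int* bools, const int* idata) {
+    int i = threadIdx.x + blockIdx.x * blockDim.x;
+    if (i < n) bools[i] = (idata[i] != 0) ? 1 : 0;
+}
+
+// step 3: scatter the kept elements to the indices produced by the scan of bools
+__global__ void kernScatter(int n, int* odata, const int* idata,
+                            const int* bools, const int* indices) {
+    int i = threadIdx.x + blockIdx.x * blockDim.x;
+    if (i < n && bools[i] == 1) odata[indices[i]] = idata[i];
+}
+
+// host side: map -> exclusive scan (bools -> indices) -> scatter;
+// the compacted length is indices[n - 1] + bools[n - 1]
+```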
diff --git a/img/blocksize.png b/img/blocksize.png
new file mode 100644
index 0000000..db7cf40
Binary files /dev/null and b/img/blocksize.png differ
diff --git a/img/blocksizeTable.png b/img/blocksizeTable.png
new file mode 100644
index 0000000..951f8f8
Binary files /dev/null and b/img/blocksizeTable.png differ
diff --git a/img/elems.png b/img/elems.png
new file mode 100644
index 0000000..d5ac6b0
Binary files /dev/null and b/img/elems.png differ
diff --git a/img/scan.png b/img/scan.png
new file mode 100644
index 0000000..a19a4b3
Binary files /dev/null and b/img/scan.png differ
diff --git a/img/scanTable.png b/img/scanTable.png
new file mode 100644
index 0000000..6616e02
Binary files /dev/null and b/img/scanTable.png differ
diff --git a/img/scannotpow2.png b/img/scannotpow2.png
new file mode 100644
index 0000000..253ebba
Binary files /dev/null and b/img/scannotpow2.png differ
diff --git a/img/scanpow2.png b/img/scanpow2.png
new file mode 100644
index 0000000..817de79
Binary files /dev/null and b/img/scanpow2.png differ
diff --git a/img/stream.png b/img/stream.png
new file mode 100644
index 0000000..9699f10
Binary files /dev/null and b/img/stream.png differ
diff --git a/img/streamTable.png b/img/streamTable.png
new file mode 100644
index 0000000..cad7356
Binary files /dev/null and b/img/streamTable.png differ
diff --git a/img/streamnotpow2.png b/img/streamnotpow2.png
new file mode 100644
index 0000000..cf1d8e0
Binary files /dev/null and b/img/streamnotpow2.png differ
diff --git a/img/streampow2.png b/img/streampow2.png
new file mode 100644
index 0000000..3ea4e41
Binary files /dev/null and b/img/streampow2.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 896ac2b..67d5d7b 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,7 +13,9 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+// const int SIZE = 1 << 8; // feel free to change the size of array
+// todo bring back
+const int SIZE = 1 << 20;
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
@@ -28,6 +30,7 @@ int main(int argc, char* argv[]) {
     printf("****************\n");
 
     genArray(SIZE - 1, a, 50);  // Leave a 0 at the end to test that edge case
+    // genArray(SIZE, a, 50);  // for cases without 0 at the end
     a[SIZE - 1] = 0;
     printArray(SIZE, a, true);
 
@@ -51,7 +54,7 @@ int main(int argc, char* argv[]) {
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
@@ -64,35 +67,35 @@ int main(int argc, char* argv[]) {
     printDesc("naive scan, non-power-of-two");
     StreamCompaction::Naive::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, power-of-two");
     StreamCompaction::Efficient::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, non-power-of-two");
     StreamCompaction::Efficient::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
+    printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
     printDesc("thrust scan, power-of-two");
     StreamCompaction::Thrust::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("thrust scan, non-power-of-two");
     StreamCompaction::Thrust::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
+    printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
     printf("\n");
@@ -103,6 +106,7 @@ int main(int argc, char* argv[]) {
 
     // Compaction tests
     genArray(SIZE - 1, a, 4);  // Leave a 0 at the end to test that edge case
+    //genArray(SIZE, a, 4); // Case without 0 at the end;
     a[SIZE - 1] = 0;
     printArray(SIZE, a, true);
 
@@ -137,18 +141,68 @@ int main(int argc, char* argv[]) {
     printDesc("work-efficient compact, power-of-two");
     count = StreamCompaction::Efficient::compact(SIZE, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
+    printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient compact, non-power-of-two");
     count = StreamCompaction::Efficient::compact(NPOT, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
+    printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
 
+    printf("\n");
+    printf("**********************\n");
+    printf("** RADIX SORT TESTS **\n");
+    printf("**********************\n");
+
+    // Radix Sort tests
+
+    genArray(SIZE - 1, a, 8);  // Leave a 0 at the end to test that edge case
+    //genArray(SIZE, a, 4); // Case without 0 at the end;
+    a[SIZE - 1] = 0;
+    printArray(SIZE, a, true);
+
+    int* d = new int[SIZE];
+    int* e = new int[SIZE];
+
+    zeroArray(SIZE, b);
+    printDesc("control case thrust sort, power-of-two");
+    StreamCompaction::Thrust::sort(SIZE, b, a);
+    printArray(SIZE, b, true);
+
+    zeroArray(SIZE, c);
+    printDesc("control case thrust sort, non-power-of-two");
+    StreamCompaction::Thrust::sort(NPOT, c, a);
+    printArray(NPOT, c, true);
+    // do not compare to array b.
+
+    zeroArray(SIZE, d);
+    printDesc("radix sort, power-of-two");
+    RadixSort::radixSort(SIZE, d, a);
+    printArray(SIZE, d, true);
+    printCmpResult(SIZE, d, b);
+
+    /* zeroArray(SIZE, b);
+    printDesc("radix sort, power-of-two");
+    count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
+    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+    expectedCount = count;
+    printArray(count, b, true);
+    printCmpLenResult(count, expectedCount, b, b);
+
+    zeroArray(SIZE, b);
+    printDesc("radix sort, non power-of-two");
+    count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
+    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+    expectedCount = count;
+    printArray(count, b, true);
+    printCmpLenResult(count, expectedCount, b, b);*/
+
     system("pause"); // stop Win32 console from closing on exit
     delete[] a;
     delete[] b;
     delete[] c;
+    delete[] d;
+    delete[] e;
 }
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa11..21c6943 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -13,26 +13,43 @@ namespace StreamCompaction {
         }
 
         /**
-         * CPU scan (prefix sum).
+         * CPU scan (prefix sum).
          * For performance analysis, this is supposed to be a simple for loop.
         * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
         */
        void scan(int n, int *odata, const int *idata) {
            timer().startCpuTimer();
-            // TODO
+            // perform an exclusive prefix sum
+            for (int i = 0; i < n; i++) {
+                if (i == 0) {
+                    odata[i] = 0;
+                }
+                else {
+                    odata[i] = odata[i - 1] + idata[i - 1];
+                }
+            }
            timer().endCpuTimer();
        }
 
        /**
         * CPU stream compaction without using the scan function.
-         *
+         *
         * @returns the number of elements remaining after compaction.
         */
        int compactWithoutScan(int n, int *odata, const int *idata) {
            timer().startCpuTimer();
-            // TODO
+            // remove elements that are 0
+            int oi = 0;
+            for (int i = 0; i < n; i++) {
+                if (idata[i] != 0) {
+                    odata[oi] = idata[i];
+                    oi++;
+                }
+            }
            timer().endCpuTimer();
-            return -1;
+
+            // return the number of valid elements
+            return oi;
        }
 
        /**
@@ -43,8 +60,46 @@ namespace StreamCompaction {
        int compactWithScan(int n, int *odata, const int *idata) {
            timer().startCpuTimer();
            // TODO
+            int* tdata = new int[n];
+            int* sdata = new int[n];
+
+            // map the input to a 0/1 "keep" array
+            for (int i = 0; i < n; i++) {
+                if (idata[i] != 0) {
+                    tdata[i] = 1;
+                }
+                else {
+                    // new int[n] is not zero-initialized, so write the 0 explicitly
+                    tdata[i] = 0;
+                }
+            }
+
+            // perform an exclusive prefix sum on tdata to produce sdata
+            for (int i = 0; i < n; i++) {
+                if (i == 0) {
+                    sdata[i] = 0;
+                }
+                else {
+                    sdata[i] = sdata[i - 1] + tdata[i - 1];
+                }
+            }
+
+            // scatter: iterate over tdata; if tdata[i] is 1, write idata[i] to odata[sdata[i]],
+            // where the compacted size is the last element of sdata plus the last element of tdata
+            for (int i = 0; i < n; i++) {
+                if (tdata[i] == 1) {
+                    odata[sdata[i]] = idata[i];
+                }
+            }
+
+            int numValid = sdata[n - 1] + tdata[n - 1];
+
+            delete[] tdata;
+            delete[] sdata;
+
            timer().endCpuTimer();
-            return -1;
+            return numValid;
        }
    }
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..613f7f4 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -2,6 +2,9 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "efficient.h"
+#include <iostream>
+#include <cstring>
+
+#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__)
 
 namespace StreamCompaction {
    namespace Efficient {
@@ -12,13 +15,190 @@ namespace StreamCompaction {
            return timer;
        }
 
+        /*! Block size used for CUDA kernel launch. */
+        #define blockSizeEffScan 256
+        #define blockSizeCompact 256
+
+        // up-sweep (reduction) step: every partner pair at the given offset
+        // sums into the right-hand partner's slot
+        __global__ void kernReductionHelper(int n, int offset, int* tdata) {
+            int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (idx < n && (idx + 1) % offset == 0) {
+                int neighborLoc = offset / 2;
+                int a = tdata[idx - neighborLoc];
+                int b = tdata[idx];
+                tdata[idx] = a + b;
+            }
+        }
+
+        // down-sweep step: the left partner takes the node's value and the
+        // right partner takes the sum of both
+        __global__ void kernPartialSumHelper(int n, int offset, int* tdata) {
+            int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (idx < n && (idx + 1) % offset == 0) {
+                int neighborLoc = offset / 2;
+                int a = tdata[idx - neighborLoc];
+                int b = tdata[idx];
+
+                tdata[idx - neighborLoc] = b;
+                tdata[idx] = a + b;
+            }
+        }
+
+        // shift all numbers one slot to the left
+        __global__ void kernShiftLeft(int n, int* odata, int* idata) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index < n) {
+                if (index > 0) {
+                    odata[index - 1] = idata[index];
+                }
+            }
+        }
+
        /**
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
        void scan(int n, int *odata, const int *idata) {
+            int fullBlocksPerArray = (n + blockSizeEffScan - 1) / blockSizeEffScan;
+
+            // for performance analysis
+            printf("blockSize: %i \n", blockSizeEffScan);
+
+            // pad the array to the next power of two so it forms a full binary tree
+            int nextPowTwo = ilog2ceil(n);
+            int numZeroes = (1 << nextPowTwo) - n;
+
+            // 1. up sweep, same as a reduction
+            // malloc enough space for the n elements plus the padding 0's
+            int* dev_tidata;
+            cudaMalloc((void**)&dev_tidata, (n + numZeroes) * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc dev_tidata failed!");
+
+            // set all elems to 0
+            cudaMemset(dev_tidata, 0, (n + numZeroes) * sizeof(int));
+            checkCUDAErrorWithLine("cudaMemset all elems to 0 failed!");
+
+            // copy the contents of idata into tidata so we can pass in tidata and modify it
+            // in place on every pass; anything after the first n elements stays 0
+            cudaMemcpy(dev_tidata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAErrorWithLine("cudaMemcpy idata to dev_tidata failed!");
+
+            // start gpu timer
            timer().startGpuTimer();
-            // TODO
+
+            for (int depth = 1; depth <= ilog2ceil(n + numZeroes); depth++) {
+                int offset = 1 << depth;
+                kernReductionHelper<<<fullBlocksPerArray, blockSizeEffScan>>>(n + numZeroes, offset, dev_tidata);
+                // wait for all threads to finish before the next level reads
+                cudaDeviceSynchronize();
+            }
+
+            // set the last int of the array to 0; this is the root of the down sweep
+            cudaMemset(&dev_tidata[n + numZeroes - 1], 0, sizeof(int));
+            checkCUDAErrorWithLine("cudaMemset last int failed!");
+
+            // 2. down sweep, from the widest offset back down to pairs
+            for (int depth = ilog2ceil(n + numZeroes); depth >= 1; depth--) {
+                int offset = 1 << depth;
+                kernPartialSumHelper<<<fullBlocksPerArray, blockSizeEffScan>>>(n + numZeroes, offset, dev_tidata);
+                cudaDeviceSynchronize();
+            }
+
+            // stop gpu timer
            timer().endGpuTimer();
+
+            // copy the final result to odata
+            cudaMemcpy(odata, dev_tidata, n * sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAErrorWithLine("cudaMemcpy dev_tidata to odata failed!");
+
+            // free all buffers
+            cudaFree(dev_tidata);
+        }
+
+        __global__ void kernMapToBoolean(int n, int* odata, int* idata) {
+            int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (idx < n) {
+                if (idata[idx] != 0) {
+                    // add 1 to the out array
+                    odata[idx] = 1;
+                }
+                else {
+                    // add 0 to the out array
+                    odata[idx] = 0;
+                }
+            }
+        }
+
+        __global__ void kernScatter(int n, int* odata, int* idata, int* tdata, int* sdata) {
+            int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (idx < n) {
+                if (tdata[idx] == 1) {
+                    int destinationIdx = sdata[idx];
+                    odata[destinationIdx] = idata[idx];
+                }
+                // otherwise do not write
+            }
+        }
+
+        // re-implementation of scan for compact: the same up sweep / down sweep,
+        // but the input and output are already device pointers
+        void compactScan(int n, int* dev_odata, int* dev_idata) {
+            int fullBlocksPerArray = (n + blockSizeCompact - 1) / blockSizeCompact;
+
+            // pad the array to the next power of two
+            int nextPowTwo = ilog2ceil(n);
+            int numZeroes = (1 << nextPowTwo) - n;
+
+            // 1. up sweep, same as a reduction
+            int* dev_tidata;
+            cudaMalloc((void**)&dev_tidata, (n + numZeroes) * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc dev_tidata failed!");
+
+            // set all elems to 0
+            cudaMemset(dev_tidata, 0, (n + numZeroes) * sizeof(int));
+            checkCUDAErrorWithLine("cudaMemset all elems to 0 failed!");
+
+            // copy dev_idata into dev_tidata; both pointers live on the GPU,
+            // so this must be a device-to-device copy
+            cudaMemcpy(dev_tidata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToDevice);
+            checkCUDAErrorWithLine("cudaMemcpy dev_idata to dev_tidata failed!");
+
+            for (int depth = 1; depth <= ilog2ceil(n + numZeroes); depth++) {
+                int offset = 1 << depth;
+                kernReductionHelper<<<fullBlocksPerArray, blockSizeCompact>>>(n + numZeroes, offset, dev_tidata);
+                cudaDeviceSynchronize();
+            }
+
+            // set the last int of the array to 0
+            cudaMemset(&dev_tidata[n + numZeroes - 1], 0, sizeof(int));
+            checkCUDAErrorWithLine("cudaMemset last int failed!");
+
+            // 2. down sweep
+            for (int depth = ilog2ceil(n + numZeroes); depth >= 1; depth--) {
+                int offset = 1 << depth;
+                kernPartialSumHelper<<<fullBlocksPerArray, blockSizeCompact>>>(n + numZeroes, offset, dev_tidata);
+                cudaDeviceSynchronize();
+            }
+
+            // copy the final result to dev_odata (device to device)
+            cudaMemcpy(dev_odata, dev_tidata, n * sizeof(int), cudaMemcpyDeviceToDevice);
+            checkCUDAErrorWithLine("cudaMemcpy dev_tidata to dev_odata failed!");
+
+            // free all buffers
+            cudaFree(dev_tidata);
        }
 
        /**
@@ -31,10 +211,198 @@ namespace StreamCompaction {
         * @returns The number of elements remaining after compaction.
         */
        int compact(int n, int *odata, const int *idata) {
+            int fullBlocksPerArray = (n + blockSizeCompact - 1) / blockSizeCompact;
+
+            printf("blockSize: %i \n", blockSizeCompact);
+            // 1. compute the temp array: 1 for everything that fits the rule, 0 otherwise.
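+            //
+            // buffer flow through the three steps below (sketch):
+            //   idata (host) -> dev_iArray --kernMapToBoolean--> dev_tempArray
+            //   dev_tempArray --compactScan--> dev_scanArray
+            //   (dev_iArray, dev_tempArray, dev_scanArray) --kernScatter--> dev_scatterFinal -> odata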
+            int* dev_iArray;
+            int* dev_tempArray;
+
+            cudaMalloc((void**)&dev_tempArray, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc dev_tempArray failed!");
+
+            cudaMalloc((void**)&dev_iArray, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc dev_iArray failed!");
+
+            cudaMemcpy(dev_iArray, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAErrorWithLine("cudaMemcpy dev_iArray failed!");
+
+            int* dev_scanArray;
+            cudaMalloc((void**)&dev_scanArray, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc dev_scanArray failed!");
+
+            // start gpu timer
            timer().startGpuTimer();
-            // TODO
+
+            kernMapToBoolean<<<fullBlocksPerArray, blockSizeCompact>>>(n, dev_tempArray, dev_iArray);
+
+            // 2. exclusive scan on the temp array.
+            compactScan(n, dev_scanArray, dev_tempArray);
+
+            // 3. scatter
+            // the last element of the scan plus the last element of the boolean
+            // array gives the length of the compacted output.
+            int numScatters = 0;
+            int validSlot = 0;
+            cudaMemcpy(&numScatters, dev_scanArray + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            cudaMemcpy(&validSlot, dev_tempArray + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+
+            numScatters += validSlot;
+            checkCUDAErrorWithLine("cudaMemcpy numScatters failed!");
+
+            // unavoidable malloc: the output length is only known after the scan
+            int* dev_scatterFinal;
+            cudaMalloc((void**)&dev_scatterFinal, numScatters * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc dev_scatterFinal failed!");
+
+            kernScatter<<<fullBlocksPerArray, blockSizeCompact>>>(n, dev_scatterFinal, dev_iArray, dev_tempArray, dev_scanArray);
+
+            // end gpu timer
            timer().endGpuTimer();
-            return -1;
+
+            // memcpy the compacted result back to odata
+            cudaMemcpy(odata, dev_scatterFinal, numScatters * sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAErrorWithLine("cudaMemcpy dev_scatterFinal to odata failed!");
+
+            cudaFree(dev_iArray);
+            cudaFree(dev_tempArray);
+            cudaFree(dev_scanArray);
+            cudaFree(dev_scatterFinal);
+
+            return numScatters;
        }
    }
 }
+
+namespace RadixSort {
+    //// todo import performance timer
+
+    /*! Block size used for CUDA kernel launch. */
+    #define blockSize 128
+
+    // map each element to the value of its bit at bitpos (0 or 1)
+    __global__ void kernMapToBoolean(int n, int* odata, int* idata, int bitpos) {
+        int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+        if (idx < n) {
+            int num = idata[idx];
+            int bit = (num & (1 << bitpos)) >> bitpos;
+
+            if (bit != 0) {
+                odata[idx] = 1;
+            }
+            else {
+                odata[idx] = 0;
+            }
+        }
+    }
+
+    // scatter every element: keys with bit == 0 go to their f-address and
+    // keys with bit == 1 go to their t-address
+    __global__ void kernComputeScatter(int n, int bitpos, int* ddata, int* bdata, int* tdata, int* fdata, int* idata) {
+        int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+        if (idx < n) {
+            int dest = (bdata[idx] == 1) ? tdata[idx] : fdata[idx];
+            ddata[dest] = idata[idx];
+        }
+    }
+
+    __global__ void kernComputeT(int n, int* tdata, int* fdata, int totalFalses) {
+        int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+        if (idx < n) {
+            tdata[idx] = idx - fdata[idx] + totalFalses;
+        }
+    }
+
+    __global__ void kernComputeE(int n, int* odata, int* idata) {
+        int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+        if (idx < n) {
+            odata[idx] = !idata[idx];
+        }
+    }
+
+    void split(int n, int* odata, int* idata, int bitpos) {
+        // all of the following operations key off the single bit specified by bitpos
+        int fullBlocksPerArray = (n + blockSize - 1) / blockSize;
+
+        // 1. compute the b array (the bit at bitpos) and the e array (its negation)
+        int* dev_iarray;
+        cudaMalloc((void**)&dev_iarray, n * sizeof(int));
+        checkCUDAErrorWithLine("cudaMalloc dev_iarray failed!");
+        cudaMemcpy(dev_iarray, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+        checkCUDAErrorWithLine("cudaMemcpy dev_iarray failed!");
+
+        int* dev_barray;
+        cudaMalloc((void**)&dev_barray, n * sizeof(int));
+        checkCUDAErrorWithLine("cudaMalloc dev_barray failed!");
+
+        // call kernMapToBoolean to calculate b
+        kernMapToBoolean<<<fullBlocksPerArray, blockSize>>>(n, dev_barray, dev_iarray, bitpos);
+
+        int* dev_earray;
+        cudaMalloc((void**)&dev_earray, n * sizeof(int));
+        checkCUDAErrorWithLine("cudaMalloc dev_earray failed!");
+
+        kernComputeE<<<fullBlocksPerArray, blockSize>>>(n, dev_earray, dev_barray);
+
+        // 2. exclusive scan on e, stored in f
+        int* dev_farray;
+        cudaMalloc((void**)&dev_farray, n * sizeof(int));
+        checkCUDAErrorWithLine("cudaMalloc dev_farray failed!");
+
+        // use compactScan because it doesn't presume the input is a host pointer.
+        StreamCompaction::Efficient::compactScan(n, dev_farray, dev_earray);
+
+        // 3. compute the total number of falses
+        int fArrayLast = 0;
+        int eArrayLast = 0;
+        cudaMemcpy(&fArrayLast, dev_farray + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+        checkCUDAErrorWithLine("cudaMemcpy fArrayLast failed!");
+        cudaMemcpy(&eArrayLast, dev_earray + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+        checkCUDAErrorWithLine("cudaMemcpy eArrayLast failed!");
+
+        int totalFalses = fArrayLast + eArrayLast;
+
+        // 4. compute the t array, which contains the write addresses for the true keys.
+        int* dev_tarray;
+        cudaMalloc((void**)&dev_tarray, n * sizeof(int));
+        checkCUDAErrorWithLine("cudaMalloc dev_tarray failed!");
+
+        kernComputeT<<<fullBlocksPerArray, blockSize>>>(n, dev_tarray, dev_farray, totalFalses);
+
+        // 5. scatter based on the computed addresses.
+        int* dev_scatterFinal; // final output. send to odata
+        cudaMalloc((void**)&dev_scatterFinal, n * sizeof(int));
+        checkCUDAErrorWithLine("cudaMalloc dev_scatterFinal failed!");
+
+        kernComputeScatter<<<fullBlocksPerArray, blockSize>>>(n, bitpos, dev_scatterFinal, dev_barray, dev_tarray, dev_farray, dev_iarray);
+
+        // memcpy back to odata
+        cudaMemcpy(odata, dev_scatterFinal, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+        // free all device buffers
+        cudaFree(dev_iarray);
+        cudaFree(dev_barray);
+        cudaFree(dev_earray);
+        cudaFree(dev_farray);
+        cudaFree(dev_tarray);
+        cudaFree(dev_scatterFinal);
+    }
+
+    void radixSort(int n, int* odata, int* idata) {
+        // the test arrays only contain values in [0, 8), so 3 bits suffice
+        int numbits = 3;
+        // each split pass must consume the previous pass's output, so ping-pong
+        // through a scratch buffer instead of re-splitting the original input
+        int* scratch = new int[n];
+        std::memcpy(scratch, idata, n * sizeof(int));
+        for (int i = 0; i < numbits; i++) {
+            split(n, odata, scratch, i);
+            std::memcpy(scratch, odata, n * sizeof(int));
+        }
+        delete[] scratch;
+    }
+}
+
diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h
index 803cb4f..b408609 100644
--- a/stream_compaction/efficient.h
+++ b/stream_compaction/efficient.h
@@ -11,3 +11,11 @@ namespace StreamCompaction {
        int compact(int n, int *odata, const int *idata);
    }
 }
+
+namespace RadixSort {
+
+    void split(int n, int* odata, int* idata, int bitpos);
+
+    void radixSort(int n, int* odata, int* idata);
+}
+
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..c162fce 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -2,24 +2,114 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "naive.h"
+#include <iostream>
+
+#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__)
 
 namespace StreamCompaction {
    namespace Naive {
        using StreamCompaction::Common::PerformanceTimer;
+
+        /*! Block size used for CUDA kernel launch. */
+        #define blockSizeNaive 256
+
        PerformanceTimer& timer()
        {
            static PerformanceTimer timer;
            return timer;
        }
 
+        // one level of the naive scan: each element at index >= offset adds in
+        // the element offset slots to its left (reads idata, writes odata)
+        __global__ void kernScanHelper(int n, int offset, int *odata, int *idata) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index < n) {
+                if (index >= offset) {
+                    int a = idata[index - offset];
+                    int b = idata[index];
+                    odata[index] = a + b;
+                }
+                else {
+                    odata[index] = idata[index];
+                }
+            }
+        }
+
+        // shift everything right by one and write a leading 0, turning the
+        // inclusive scan into an exclusive one
+        __global__ void kernPrependZero(int n, int* odata, int* idata) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index < n) {
+                if (index == 0) {
+                    odata[index] = 0;
+                }
+                else {
+                    odata[index] = idata[index - 1];
+                }
+            }
+        }
 
        /**
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
        void scan(int n, int *odata, const int *idata) {
+            int fullBlocksPerArray = (n + blockSizeNaive - 1) / blockSizeNaive;
+
+            printf("blockSize: %i \n", blockSizeNaive);
+
+            // ping-pong buffers, plus a final buffer for the prepended 0
+            int* dev_odata1;
+            int* dev_odata2;
+            int* dev_odataFinal;
+
+            cudaMalloc((void**)&dev_odata1, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc dev_odata1 failed!");
+
+            cudaMalloc((void**)&dev_odata2, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc dev_odata2 failed!");
+
+            cudaMalloc((void**)&dev_odataFinal, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc dev_odataFinal failed!");
+
+            // set the values of odata2 to 0's; an exclusive scan never sets the 0th index
+            cudaMemset(dev_odata2, 0, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMemset dev_odata2 to 0's failed!");
+
+            // memcpy the contents of idata to odata1
+            cudaMemcpy(dev_odata1, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAErrorWithLine("cudaMemcpy idata to dev_odata1 Host to Device failed!");
+
+            // start gpu timer
            timer().startGpuTimer();
-            // TODO
+
+            // for each depth, invoke the scan kernel with offset 2^depth, then ping-pong the buffers
+            for (int depth = 0; depth < ilog2ceil(n); depth++) {
+                int offset = 1 << depth;
+
+                kernScanHelper<<<fullBlocksPerArray, blockSizeNaive>>>(n, offset, dev_odata2, dev_odata1);
+                // wait for all threads to finish this level
+                cudaDeviceSynchronize();
+
+                std::swap(dev_odata2, dev_odata1);
+            }
+
+            // shift right and add a 0 to the front; dev_odata1 always holds the most updated data
+            kernPrependZero<<<fullBlocksPerArray, blockSizeNaive>>>(n, dev_odataFinal, dev_odata1);
+
+            // end gpu timer
            timer().endGpuTimer();
+
+            // memcpy back from dev_odataFinal to odata
+            cudaMemcpy(odata, dev_odataFinal, n * sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAErrorWithLine("cudaMemcpy dev_odataFinal to odata failed!");
+
+            cudaFree(dev_odata1);
+            cudaFree(dev_odata2);
+            cudaFree(dev_odataFinal);
+        }
    }
 }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..6e3e241 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -3,9 +3,12 @@
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
 #include <thrust/scan.h>
+#include <thrust/sort.h>
 #include "common.h"
 #include "thrust.h"
 
+#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__)
+
 namespace StreamCompaction {
    namespace Thrust {
        using StreamCompaction::Common::PerformanceTimer;
@@ -18,10 +21,48 @@ namespace StreamCompaction {
        /**
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
        void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
            // TODO use `thrust::exclusive_scan`
            // example: for device_vectors dv_in and dv_out:
            // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            int* dev_idata;
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc dev_idata failed!");
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAErrorWithLine("cudaMemcpy dev_idata failed!");
+
+            int* dev_odata;
+            cudaMalloc((void**)&dev_odata, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc dev_odata failed!");
+
+            thrust::device_ptr<int> dev_thrust_in = thrust::device_ptr<int>(dev_idata);
+            thrust::device_ptr<int> dev_thrust_out = thrust::device_ptr<int>(dev_odata);
+
+            // only time the thrust scan itself, not the allocations or copies
+            timer().startGpuTimer();
+            thrust::exclusive_scan(dev_thrust_in, dev_thrust_in + n, dev_thrust_out);
+            timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAErrorWithLine("cudaMemcpy dev_odata failed!");
+
+            cudaFree(dev_idata);
+            cudaFree(dev_odata);
+        }
+
+        void sort(int n, int* odata, int* idata) {
+            int* dev_idata;
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc dev_idata failed!");
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAErrorWithLine("cudaMemcpy dev_idata failed!");
+
+            thrust::device_ptr<int> dev_thrust_in = thrust::device_ptr<int>(dev_idata);
+
+            // only time the sort itself, to match how scan is timed
+            timer().startGpuTimer();
+            thrust::sort(dev_thrust_in, dev_thrust_in + n);
+            timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost);
+            cudaFree(dev_idata);
+        }
    }
 }
diff --git a/stream_compaction/thrust.h b/stream_compaction/thrust.h
index fe98206..994d8f0 100644
--- a/stream_compaction/thrust.h
+++ b/stream_compaction/thrust.h
@@ -7,5 +7,6 @@ namespace StreamCompaction {
        StreamCompaction::Common::PerformanceTimer& timer();
 
        void scan(int n, int *odata, const int *idata);
+        void sort(int n, int* odata, int* idata);
    }
 }