diff --git a/README.md b/README.md
index 0e38ddb..59f0a2a 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,116 @@ CUDA Stream Compaction
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Shineng Tang
+  * [LinkedIn](https://www.linkedin.com/in/shineng-t-224192195/)
+* Tested on: Windows 11, i9-10900K @ 3.7GHz, 32GB RAM, RTX 3090 24GB
 
-### (TODO: Your README)
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+# Project Description
+This project implements several versions of the **scan** (_prefix sum_) algorithm and stream compaction in CUDA.
+
+![](img/Stream-compaction.png)
+
+## Main Features
+* `StreamCompaction::CPU::scan`: computes an exclusive prefix sum on the CPU.
+* `StreamCompaction::CPU::compactWithoutScan`: CPU stream compaction without using the `scan` function.
+* `StreamCompaction::CPU::compactWithScan`: CPU stream compaction using the `scan` function.
+* `StreamCompaction::Naive::scan`: a naive parallel GPU scan.
+* `StreamCompaction::Efficient::scan`: a **work-efficient** parallel GPU scan built from an _upsweep_ and a _downsweep_ phase.
+* `StreamCompaction::Thrust::scan`: a short wrapper around the **Thrust** library's exclusive scan.
+* `StreamCompaction::Efficient::compact`: a CUDA stream compaction function built on the work-efficient scan.
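+
+To make the connection between `scan` and `compact` concrete, here is a small CPU-side walkthrough of the map → exclusive scan → scatter recipe. This is an illustrative sketch only (the array contents and variable names are made up for the example), not the project's actual implementation:
+
+```cpp
+#include <cstdio>
+
+int main() {
+    const int n = 6;
+    int in[n] = { 1, 3, 0, 1, 0, 2 };
+    int bools[n];    // map:  1 where the element is kept, 0 where it is removed
+    int indices[n];  // exclusive scan of bools = output slot of each kept element
+    int out[n];
+
+    for (int i = 0; i < n; i++) {
+        bools[i] = in[i] != 0 ? 1 : 0;                   // [1, 1, 0, 1, 0, 1]
+    }
+    indices[0] = 0;
+    for (int i = 1; i < n; i++) {
+        indices[i] = indices[i - 1] + bools[i - 1];      // [0, 1, 2, 2, 3, 3]
+    }
+    for (int i = 0; i < n; i++) {
+        if (bools[i]) {
+            out[indices[i]] = in[i];                     // scatter -> [1, 3, 1, 2]
+        }
+    }
+    // the exclusive scan misses the last flag, so it is added back in
+    int count = indices[n - 1] + bools[n - 1];           // 4 elements survive
+    printf("kept %d of %d elements\n", count, n);
+    return 0;
+}
+```
+
+On the GPU, `kernMapToBoolean` and `kernScatter` perform the map and scatter steps in parallel, and the scan in the middle is exactly what the naive and work-efficient kernels implement.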
+
+## Extra Credit Features
+* `StreamCompaction::Efficient::radixSort`: a CUDA-based radix sort. It is noticeably faster than `std::sort` on large arrays; the test cases are shown at the bottom of the **Test Outputs** section below.
+* Thread optimization: by remapping thread indices and shrinking the block count at each level of the upsweep and downsweep, performance increases dramatically. A macro (`THREAD_OPTIMIZATION`) toggles this thread-optimization mode.
+
+# Test Outputs
+```
+
+****************
+** SCAN TESTS **
+****************
+    [ 47 21 28 13 2 26 1 14 4 49 25 20 43 ... 15 0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 2.729ms (std::chrono Measured)
+    [ 0 47 68 96 109 111 137 138 152 156 205 230 250 ... 51332490 51332505 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 2.7305ms (std::chrono Measured)
+    [ 0 47 68 96 109 111 137 138 152 156 205 230 250 ... 51332413 51332419 ]
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 19.9673ms (CUDA Measured)
+    [ 0 47 68 96 109 111 137 138 152 156 205 230 250 ... 51332490 51332505 ]
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 18.4846ms (CUDA Measured)
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 0.527424ms (CUDA Measured)
+    [ 0 47 68 96 109 111 137 138 152 156 205 230 250 ... 51332490 51332505 ]
+    passed
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 0.519136ms (CUDA Measured)
+    [ 0 47 68 96 109 111 137 138 152 156 205 230 250 ... 51332413 51332419 ]
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 0.51376ms (CUDA Measured)
+    [ 0 47 68 96 109 111 137 138 152 156 205 230 250 ... 51332490 51332505 ]
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 0.469344ms (CUDA Measured)
+    [ 0 47 68 96 109 111 137 138 152 156 205 230 250 ... 51332413 51332419 ]
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [ 1 3 0 1 0 2 1 0 0 1 1 0 1 ... 3 0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 4.1334ms (std::chrono Measured)
+    [ 1 3 1 2 1 1 1 1 3 3 2 2 2 ... 2 3 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 4.115ms (std::chrono Measured)
+    [ 1 3 1 2 1 1 1 1 3 3 2 2 2 ... 2 1 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 6.5739ms (std::chrono Measured)
+    [ 1 3 1 2 1 1 1 1 3 3 2 2 2 ... 2 3 ]
+    passed
+==== work-efficient compact, power-of-two ====
+   elapsed time: 0.602784ms (CUDA Measured)
+    [ 1 3 1 2 1 1 1 1 3 3 2 2 2 ... 2 3 ]
+    passed
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 0.604032ms (CUDA Measured)
+    [ 1 3 1 2 1 1 1 1 3 3 2 2 2 ... 2 1 ]
+    passed
+
+*****************************
+**    RADIX SORT TESTS     **
+*****************************
+==== radix sort, power-of-two ====
+   elapsed time: 6.89866ms (CUDA Measured)
+    [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 49 49 ]
+   elapsed time: 34.7704ms (std::chrono Measured)
+    passed
+==== radix sort, non-power-of-two ====
+   elapsed time: 6.59078ms (CUDA Measured)
+    [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 49 49 ]
+   elapsed time: 35.2076ms (std::chrono Measured)
+    passed
+```
+
+# Performance Analysis
+
+![](img/comparison1.png)
+
+For the **scan algorithm**, when the arrays are relatively small the CPU version is slightly faster than every GPU implementation, even Thrust. Increasing the array size does not immediately change that: even at around 2^14 elements, the supposedly faster work-efficient scan still trails the other implementations. Once the array size reaches 2^22, however, the work-efficient scan is already fairly close to the Thrust scan. Another observation is that the naive GPU scan does not overtake the CPU scan until roughly 2^19 elements, which comes down to its reliance on global memory and the lack of any thread-index optimization, so many threads in each warp diverge and do no useful work.
+
+![](img/comparison2.png)
+
+For **stream compaction**, the chart follows a similar pattern to the scan results: once the input is large enough, the GPU implementations are consistently faster than the CPU ones.
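+
+The thread optimization mentioned under Extra Credit comes down to how threads are indexed in the upsweep and downsweep kernels. The sketch below is a simplified side-by-side of the two code paths selected by the `THREAD_OPTIMIZATION` macro in `stream_compaction/efficient.cu` (the kernel names here are invented for the comparison): the unoptimized form launches one thread per element and lets most of them fail the `k % stride == 0` test, while the optimized form launches only as many threads as there are active tree nodes at that level and remaps each thread onto one of them, so warps stay full and far less divergence occurs.
+
+```cuda
+// Unoptimized: launched with one thread per element. At level d only
+// 1 in every `stride` threads passes the test; the rest idle but still
+// occupy their warps.
+__global__ void upsweepDivergent(int n, int d, int* data) {
+    int k = threadIdx.x + blockIdx.x * blockDim.x;
+    if (k >= n) return;
+    int stride = 1 << (d + 1);
+    if (k % stride == 0) {
+        data[k + stride - 1] += data[k + (1 << d) - 1];
+    }
+}
+
+// Optimized: launched with n = (element count) / stride threads.
+// Consecutive thread indices are remapped onto the active tree nodes,
+// so every launched thread does useful work.
+__global__ void upsweepCompacted(int n, int d, int* data) {
+    int k = threadIdx.x + blockIdx.x * blockDim.x;
+    if (k >= n) return;
+    int stride = 1 << (d + 1);
+    int idx = (k + 1) * stride - 1;
+    data[idx] += data[idx - (stride >> 1)];
+}
+```
+
+The downsweep is remapped in the same way, and the host loop halves the launch size at every level, which is what the thread-optimization toggle in the Extra Credit section refers to.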
\ No newline at end of file
diff --git a/img/Stream-compaction.png b/img/Stream-compaction.png
new file mode 100644
index 0000000..61fc07a
Binary files /dev/null and b/img/Stream-compaction.png differ
diff --git a/img/comparison1.png b/img/comparison1.png
new file mode 100644
index 0000000..8ff9a20
Binary files /dev/null and b/img/comparison1.png differ
diff --git a/img/comparison2.png b/img/comparison2.png
new file mode 100644
index 0000000..592d11a
Binary files /dev/null and b/img/comparison2.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 896ac2b..595fa22 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -11,9 +11,10 @@
 #include
 #include
 #include
+#include
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 21; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
@@ -51,7 +52,7 @@ int main(int argc, char* argv[]) {
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
@@ -71,28 +72,28 @@ int main(int argc, char* argv[]) {
     printDesc("work-efficient scan, power-of-two");
     StreamCompaction::Efficient::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, non-power-of-two");
     StreamCompaction::Efficient::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
+    printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
     printDesc("thrust scan, power-of-two");
     StreamCompaction::Thrust::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("thrust scan, non-power-of-two");
     StreamCompaction::Thrust::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
+    printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
     printf("\n");
@@ -137,16 +138,49 @@ int main(int argc, char* argv[]) {
     printDesc("work-efficient compact, power-of-two");
     count = StreamCompaction::Efficient::compact(SIZE, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
+    printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient compact, non-power-of-two");
     count = StreamCompaction::Efficient::compact(NPOT, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
+    printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
+    printf("\n");
+    printf("*****************************\n");
+    printf("**    RADIX SORT TESTS     **\n");
+    printf("*****************************\n");
+
+    genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case
+    a[SIZE - 1] = 0;
+    memcpy(b, a, SIZE * sizeof(int));
+    /*for (int i = 0; i < SIZE; i++) {
+        b[i] = a[i];
+    }*/
+    // printArray(SIZE, a, true);
+    zeroArray(SIZE, c);
+    printDesc("radix sort, power-of-two");
+    StreamCompaction::Efficient::radixSort(SIZE, c, a);
+    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+    printArray(SIZE, c, true);
+    StreamCompaction::CPU::sort(SIZE, b);
+    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+    printCmpResult(SIZE, b, c);
+
+    zeroArray(SIZE, c);
+    zeroArray(SIZE, b);
+    memcpy(b, a, NPOT * sizeof(int));
+    printDesc("radix sort, non-power-of-two");
+    StreamCompaction::Efficient::radixSort(NPOT, c, a);
+    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+    printArray(NPOT, c, true);
+    StreamCompaction::CPU::sort(NPOT, b);
+    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+    printCmpResult(NPOT, b, c);
+
     system("pause"); // stop Win32 console from closing on exit
     delete[] a;
     delete[] b;
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 2ed6d63..d845250 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -24,6 +24,13 @@ namespace StreamCompaction {
          */
         __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
             // TODO
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+
+            bools[index] = idata[index] == 0 ? 0 : 1;
         }
 
         /**
@@ -33,6 +40,13 @@ namespace StreamCompaction {
         __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) {
             // TODO
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+            if (bools[index] == 1) {
+                odata[indices[index]] = idata[index];
+            }
         }
     }
 }
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
index d2c1fed..57f97ec 100644
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.h
@@ -9,6 +9,7 @@
 #include
 #include
 #include
+#include
 
 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
 #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa11..fcbb505 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -20,6 +20,10 @@ namespace StreamCompaction {
         void scan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+            odata[0] = 0;
+            for (int i = 1; i < n; i++) {
+                odata[i] = odata[i - 1] + idata[i - 1];
+            }
             timer().endCpuTimer();
         }
 
@@ -31,8 +35,14 @@ namespace StreamCompaction {
         int compactWithoutScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+            int count = 0;
+            for (int i = 0; i < n; i++) {
+                if (idata[i] != 0) {
+                    odata[count++] = idata[i];
+                }
+            }
             timer().endCpuTimer();
-            return -1;
+            return count;
         }
 
         /**
@@ -43,8 +53,32 @@ namespace StreamCompaction {
         int compactWithScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+            int* temp = new int[n];
+            for (int i = 0; i < n; i++) {
+                temp[i] = idata[i] == 0 ? 0 : 1;
+            }
+            // scan result: exclusive prefix sum of the boolean flags
+            odata[0] = 0;
+            for (int i = 1; i < n; i++) {
+                odata[i] = odata[i - 1] + temp[i - 1];
+            }
+            //int count = odata[n - 1];
+            int count = 0;
+            for (int i = 0; i < n; i++) {
+                if (idata[i] != 0) {
+                    odata[odata[i]] = idata[i];
+                    count++;
+                }
+            }
+            timer().endCpuTimer();
+            delete[] temp;
+            return count;
+        }
+
+        void sort(int n, int* idata) {
+            timer().startCpuTimer();
+            std::sort(idata, idata + n);
             timer().endCpuTimer();
-            return -1;
         }
     }
 }
diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h
index 873c047..9ca5664 100644
--- a/stream_compaction/cpu.h
+++ b/stream_compaction/cpu.h
@@ -11,5 +11,7 @@ namespace StreamCompaction {
         int compactWithoutScan(int n, int *odata, const int *idata);
 
         int compactWithScan(int n, int *odata, const int *idata);
+
+        void sort(int n, int* idata);
     }
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..a5030f9 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -1,8 +1,13 @@
 #include
 #include
+#include
 #include "common.h"
 #include "efficient.h"
+
+#define blockSize 128
+#define THREAD_OPTIMIZATION 1
+
 namespace StreamCompaction {
     namespace Efficient {
         using StreamCompaction::Common::PerformanceTimer;
@@ -12,13 +17,101 @@ namespace StreamCompaction {
             return timer;
         }
 
-        /**
+        __global__ void kernUpSweep(int n, int d, int* data) {
+            int k = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (k >= n) {
+                return;
+            }
+
+#if THREAD_OPTIMIZATION
+            // n is the number of active tree nodes at this level; remap each
+            // thread onto the node it should update so no thread is wasted.
+            int stride = 1 << (d + 1);
+            k = (k + 1) * stride - 1;
+            data[k] += data[k - (stride >> 1)];
+#else
+            // One thread per element; only 1 in `stride` threads does any work.
+            int stride = 1 << (d + 1);
+            if (k % stride == 0) {
+                data[k + stride - 1] += data[k + (1 << d) - 1];
+            }
+#endif
+        }
+
+        __global__ void kernDownSweep(int n, int d, int* data) {
+            int k = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (k >= n) {
+                return;
+            }
+
+#if THREAD_OPTIMIZATION
+            int stride = 1 << (d + 1);
+            k = (k + 1) * stride - 1;
+            int t = data[k - (stride >> 1)];
+            data[k - (stride >> 1)] = data[k];
+            data[k] += t;
+#else
+            int stride = 1 << (d + 1);
+            int pow_2 = 1 << d;
+            if (k % stride == 0) {
+                int t = data[k + pow_2 - 1];
+                data[k + pow_2 - 1] = data[k + stride - 1];
+                data[k + stride - 1] = t + data[k + stride - 1];
+            }
+#endif
+        }
+
+        // In-place work-efficient exclusive scan over a device buffer whose
+        // length `size` is a power of two.
+        void prefixSumScan(int size, int* idata) {
+            dim3 fullBlocksPerGrid((size + blockSize - 1) / blockSize);
+            int threads = size;
+            for (int d = 0; d < ilog2ceil(size); d++) {
+#if THREAD_OPTIMIZATION
+                dim3 blocksPerGrid((threads + blockSize - 1) / blockSize);
+                threads /= 2;
+                kernUpSweep<<<blocksPerGrid, blockSize>>>(threads, d, idata);
+#else
+                kernUpSweep<<<fullBlocksPerGrid, blockSize>>>(size, d, idata);
+#endif
+            }
+            // set the last element to 0 before the downsweep
+            int val = 0;
+            cudaMemcpy(&idata[size - 1], &val, sizeof(int), cudaMemcpyHostToDevice);
+
+            // threads is already 1 at this point
+            for (int d = ilog2ceil(size) - 1; d >= 0; d--) {
+#if THREAD_OPTIMIZATION
+                dim3 blocksPerGrid((threads + blockSize - 1) / blockSize);
+                kernDownSweep<<<blocksPerGrid, blockSize>>>(threads, d, idata);
+                threads *= 2;
+#else
+                kernDownSweep<<<fullBlocksPerGrid, blockSize>>>(size, d, idata);
+#endif
+            }
+        }
+
+        /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
+            int* dev_data;
+
+            // round the buffer up to the next power of two so the scan tree is complete
+            int size = 1 << ilog2ceil(n);
+            cudaMalloc((void**)&dev_data, size * sizeof(int));
+            cudaMemcpy(dev_data, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+
+            timer().startGpuTimer();
             // TODO
+            prefixSumScan(size, dev_data);
             timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_data);
         }
 
         /**
@@ -31,10 +124,124 @@ namespace StreamCompaction {
          * @returns The number of elements remaining after compaction.
          */
         int compact(int n, int *odata, const int *idata) {
+            dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+            int* dev_bool;
+            int* dev_idata;
+            int* dev_odata;
+            int* dev_scanResult;
+
+            int count = 0;
+            int lastBool = 0;
+
+            int size = 1 << ilog2ceil(n);
+
+            cudaMalloc((void**)&dev_bool, n * sizeof(int));
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            cudaMalloc((void**)&dev_odata, n * sizeof(int));
+            cudaMalloc((void**)&dev_scanResult, size * sizeof(int));
+
+            cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
             timer().startGpuTimer();
             // TODO
+            Common::kernMapToBoolean<<<fullBlocksPerGrid, blockSize>>>(n, dev_bool, dev_idata);
+            cudaMemcpy(dev_scanResult, dev_bool, n * sizeof(int), cudaMemcpyDeviceToDevice);
+            prefixSumScan(size, dev_scanResult);
+            Common::kernScatter<<<fullBlocksPerGrid, blockSize>>>(n, dev_odata, dev_idata, dev_bool, dev_scanResult);
             timer().endGpuTimer();
-            return -1;
+
+            // The last boolean is not counted by the exclusive scan; if the last
+            // element is kept, it has to be added back in. (Not shown in the slides.)
+            cudaMemcpy(&count, dev_scanResult + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            cudaMemcpy(&lastBool, dev_bool + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+
+            int length = count + lastBool;
+            cudaMemcpy(odata, dev_odata, sizeof(int) * length, cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_bool);
+            cudaFree(dev_idata);
+            cudaFree(dev_odata);
+            cudaFree(dev_scanResult);
+
+            return length;
+        }
+
+        // Marks elements whose k-th bit is 0 with a 1 (the "false" flags used
+        // by the radix sort split).
+        __global__ void kernNegate(int n, int k, int* data, int* bools) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+
+            int curBit = (data[index] & (1 << k)) >> k;
+            bools[index] = curBit == 1 ? 0 : 1;
+        }
+
+        // Scatters each element to its position for the current bit: elements
+        // with bit k == 0 go to the front (in order), the rest follow them.
+        __global__ void kernSplit(int n, int k, int totalFalses, int* scan, int* idata, int* odata) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+
+            int cur = idata[index];
+            int curBit = (cur & (1 << k)) >> k;
+            int scanIdx = scan[index];
+            // odata[index] = curBit == 0 ? idata[scanIdx] : idata[index - scanIdx + totalFalses];
+            if (curBit == 0) {
+                odata[scanIdx] = cur;
+            } else {
+                odata[index - scanIdx + totalFalses] = cur;
+            }
+        }
+
+        void radixSort(int n, int* odata, const int* idata) {
+            int* dev_idata;
+            int* dev_odata;
+            int* dev_scanResult;
+            int* dev_bool;
+
+            int size = 1 << ilog2ceil(n);
+            int lastBool = 0;
+            int lastCount = 0;
+            int totalFalses = 0;
+
+            dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            cudaMalloc((void**)&dev_odata, n * sizeof(int));
+            cudaMalloc((void**)&dev_bool, n * sizeof(int));
+            cudaMalloc((void**)&dev_scanResult, size * sizeof(int));
+
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+            timer().startGpuTimer();
+
+            int max = *std::max_element(idata, idata + n);
+            // number of bits needed to represent the largest value
+            // (max + 1 so that exact powers of two keep their top bit)
+            int numBits = ilog2ceil(max + 1);
+
+            for (int i = 0; i < numBits; i++) {
+                kernNegate<<<fullBlocksPerGrid, blockSize>>>(n, i, dev_idata, dev_bool);
+                cudaMemcpy(&lastBool, dev_bool + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+                cudaMemcpy(dev_scanResult, dev_bool, n * sizeof(int), cudaMemcpyDeviceToDevice);
+                prefixSumScan(size, dev_scanResult);
+                cudaMemcpy(&lastCount, dev_scanResult + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+                totalFalses = lastBool + lastCount;
+                kernSplit<<<fullBlocksPerGrid, blockSize>>>(n, i, totalFalses, dev_scanResult, dev_idata, dev_odata);
+                std::swap(dev_idata, dev_odata);
+            }
+            timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_idata);
+            cudaFree(dev_odata);
+            cudaFree(dev_bool);
+            cudaFree(dev_scanResult);
         }
     }
 }
diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h
index 803cb4f..8efb4f4 100644
--- a/stream_compaction/efficient.h
+++ b/stream_compaction/efficient.h
@@ -9,5 +9,7 @@ namespace StreamCompaction {
         void scan(int n, int *odata, const int *idata);
 
         int compact(int n, int *odata, const int *idata);
+
+        void radixSort(int n, int* odata, const int* idata);
     }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..77f5a02 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -3,6 +3,10 @@
 #include "common.h"
 #include "naive.h"
 
+
+#define blockSize 128
+
 namespace StreamCompaction {
     namespace Naive {
         using StreamCompaction::Common::PerformanceTimer;
@@ -12,14 +16,48 @@ namespace StreamCompaction {
             return timer;
         }
         // TODO: __global__
-
+        __global__ void kernScan(int n, int d, int* odata, const int* idata) {
+            int k = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (k >= n) {
+                return;
+            }
+            int stride = 1 << (d - 1);
+            if (k >= stride) {
+                odata[k] = idata[k - stride] + idata[k];
+            }
+            else {
+                odata[k] = idata[k];
+            }
+        }
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            int* dev_idata;
+            int* dev_odata;
+
+            dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            cudaMalloc((void**)&dev_odata, n * sizeof(int));
+
+            cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+            // cudaMemcpy(dev_odata, odata, sizeof(int) * n, cudaMemcpyHostToDevice);
             timer().startGpuTimer();
             // TODO
+            for (int d = 1; d <= ilog2ceil(n); d++) {
+                kernScan<<<fullBlocksPerGrid, blockSize>>>(n, d, dev_odata, dev_idata);
+                std::swap(dev_odata, dev_idata);
+            }
             timer().endGpuTimer();
+            // convert the inclusive scan result into an exclusive scan
+            odata[0] = 0;
+            cudaMemcpy(odata + 1, dev_idata, sizeof(int) * (n - 1), cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_idata);
+            cudaFree(dev_odata);
         }
     }
 }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..ed16d1e 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -18,11 +18,17 @@ namespace StreamCompaction {
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            thrust::host_vector<int> host_data(idata, idata + n);
+            thrust::device_vector<int> dev_in(host_data);
+            thrust::device_vector<int> dev_out(host_data);
             timer().startGpuTimer();
             // TODO use `thrust::exclusive_scan`
             // example: for device_vectors dv_in and dv_out:
             // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            thrust::exclusive_scan(dev_in.begin(), dev_in.end(), dev_out.begin());
             timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_out.data().get(), sizeof(int) * n, cudaMemcpyDeviceToHost);
         }
     }
 }