diff --git a/README.md b/README.md
index 0e38ddb..b26ef7c 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,84 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
-
-### (TODO: Your README)
+* Ryan Tong
+  * [LinkedIn](https://www.linkedin.com/in/ryanctong/), [personal website](), [twitter](), etc.
+* Tested on: Windows 10, i7-8750H @ 2.20GHz 16GB, GeForce GTX 1060 6144MB (Personal Laptop)
 
-Include analysis, etc. (Remember, this is public, so don't put anything here that you don't want to share with the world.)
+### Features
+This project implements the required features: CPU-based scan and stream compaction; GPU-based naive scan, work-efficient scan, and stream compaction (built on the work-efficient scan); and Thrust-based scan. I roughly optimized the block size to 256 after some testing and after reading this article: https://oneflow2020.medium.com/how-to-choose-the-grid-size-and-block-size-for-a-cuda-kernel-d1ff1f0a7f92.
+
+### Performance Analysis
+Here are the graphs comparing the runtimes of scan implemented on the CPU, on the GPU, and with Thrust. Note that Thrust is removed and the range of array sizes is shrunk in the second graph for visualization purposes.
+
+![Scan](img/scan.png)
+![Scan (Better Visualization)](img/scan_small.png)
+
+From the graphs, we can see that the CPU implementation is faster than the work-efficient GPU implementation until the array size reaches roughly 1,000,000 elements. This is surprising because the theoretical work complexities of these algorithms are O(n), O(n log n), and O(n) for the CPU, naive, and work-efficient implementations respectively. Since the GPU implementations are parallelized, we would expect them to be faster than the CPU implementation. The cause is likely the lack of optimizations in my GPU code and the frequent reads and writes to global memory, which are slow. An implementation using shared memory would improve the memory access speeds. Furthermore, the thread indexing of the scan kernels is inefficient: the active threads are spread out across the array, so warps contain many inactive threads that could otherwise be retired early if the active threads were consecutive.
+
+The Thrust implementation is significantly slower than both the GPU and CPU implementations, which is likely due to some implementation error that I was unable to solve.
+
+We can see these inefficiencies reflected again in the stream compaction runtimes:
+
+![Stream Compaction](img/compaction.png)
+![Stream Compaction (Better Visualization)](img/compaction_small.png)
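+
+To make the complexity comparison concrete: the naive scan runs log2(n) passes that each touch up to n elements (O(n log n) total adds), while the work-efficient up-sweep touches a halving number of nodes per level (O(n) total adds). Below is a minimal CPU sketch of the two access patterns (illustrative only, not the timed implementations; `data` is assumed to be an in-place int array of length `n`, a power of two):
+
+```
+// Naive (Hillis-Steele) pattern: log2(n) passes over the whole array.
+// Iterating i downward reads pre-pass values, yielding an inclusive scan.
+for (int d = 1; d < n; d <<= 1) {        // O(log n) passes
+    for (int i = n - 1; i >= d; --i) {   // O(n) adds per pass
+        data[i] += data[i - d];
+    }
+}
+
+// Work-efficient (Blelloch) up-sweep pattern: the number of active nodes
+// halves each level, so the total is n - 1 = O(n) adds.
+for (int d = 1; d < n; d <<= 1) {
+    for (int i = 2 * d - 1; i < n; i += 2 * d) {
+        data[i] += data[i - d];
+    }
+}
+```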
+
+### Program Output
+```
+
+****************
+** SCAN TESTS **
+****************
+    [ 29 48 7 23 28 16 45 34 2 47 35 3 16 ... 48 0 ]
+==== cpu scan, power-of-two ====
+    elapsed time: 2.702ms (std::chrono Measured)
+    [ 0 29 77 84 107 135 151 196 230 232 279 314 317 ... 12845931 12845979 ]
+==== cpu scan, non-power-of-two ====
+    elapsed time: 2.7096ms (std::chrono Measured)
+    [ 0 29 77 84 107 135 151 196 230 232 279 314 317 ... 12845838 12845880 ]
+    passed
+==== naive scan, power-of-two ====
+    elapsed time: 4.85891ms (CUDA Measured)
+    passed
+==== naive scan, non-power-of-two ====
+    elapsed time: 4.5247ms (CUDA Measured)
+    passed
+==== work-efficient scan, power-of-two ====
+    elapsed time: 2.23603ms (CUDA Measured)
+    passed
+==== work-efficient scan, non-power-of-two ====
+    elapsed time: 2.10493ms (CUDA Measured)
+    passed
+==== thrust scan, power-of-two ====
+    elapsed time: 35.5277ms (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+    elapsed time: 27.5845ms (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [ 3 2 3 1 2 0 3 0 2 3 1 3 2 ... 2 0 ]
+==== cpu compact without scan, power-of-two ====
+    elapsed time: 6.4836ms (std::chrono Measured)
+    [ 3 2 3 1 2 3 2 3 1 3 2 3 1 ... 1 2 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+    elapsed time: 5.3097ms (std::chrono Measured)
+    [ 3 2 3 1 2 3 2 3 1 3 2 3 1 ... 2 1 ]
+    passed
+==== cpu compact with scan ====
+    elapsed time: 14.7061ms (std::chrono Measured)
+    [ 3 2 3 1 2 3 2 3 1 3 2 3 1 ... 1 2 ]
+    passed
+==== work-efficient compact, power-of-two ====
+    elapsed time: 2.84058ms (CUDA Measured)
+    passed
+==== work-efficient compact, non-power-of-two ====
+    elapsed time: 2.50528ms (CUDA Measured)
+    passed
+```
diff --git a/img/compaction.png b/img/compaction.png
new file mode 100644
index 0000000..248b095
Binary files /dev/null and b/img/compaction.png differ
diff --git a/img/compaction_small.png b/img/compaction_small.png
new file mode 100644
index 0000000..0e1d672
Binary files /dev/null and b/img/compaction_small.png differ
diff --git a/img/scan.png b/img/scan.png
new file mode 100644
index 0000000..cafbad8
Binary files /dev/null and b/img/scan.png differ
diff --git a/img/scan_small.png b/img/scan_small.png
new file mode 100644
index 0000000..a83aea8
Binary files /dev/null and b/img/scan_small.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 896ac2b..ac0ccc8 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,7 +13,7 @@
 #include
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 5; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
index 567795b..18cc27e 100644
--- a/stream_compaction/CMakeLists.txt
+++ b/stream_compaction/CMakeLists.txt
@@ -1,3 +1,4 @@
+cmake_minimum_required(VERSION 3.1)
 set(headers
     "common.h"
     "cpu.h"
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 2ed6d63..ea8a869 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -23,7 +23,12 @@ namespace StreamCompaction {
          * which map to 0 will be removed, and elements which map to 1 will be kept.
          */
        __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
-            // TODO
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+
+            bools[index] = idata[index] == 0 ? 0 : 1;
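+
+            // Worked example (illustrative) of the compaction pipeline built
+            // from these kernels:
+            //   idata   = [3 0 1 0 2]
+            //   bools   = [1 0 1 0 1]   (this kernel)
+            //   indices = [0 1 1 2 2]   (exclusive scan of bools)
+            //   odata   = [3 1 2]       (kernScatter: odata[indices[i]] = idata[i] where bools[i] == 1)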
        }
 
        /**
@@ -32,7 +37,12 @@ namespace StreamCompaction {
          */
        __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) {
-            // TODO
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n || bools[index] == 0) {
+                return;
+            }
+
+            odata[indices[index]] = idata[index];
        }
    }
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
index d2c1fed..5f88cfd 100644
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.h
@@ -9,6 +9,9 @@
 #include
 #include
 #include
+#include "device_launch_parameters.h"
+
+#define blockSize 4
 
 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
 #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa11..37aa00c 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -19,7 +19,11 @@ namespace StreamCompaction {
          */
        void scan(int n, int *odata, const int *idata) {
            timer().startCpuTimer();
-            // TODO
+            int prev = 0;
+            for (int i = 0; i < n; ++i) {
+                odata[i] = prev;
+                prev += idata[i];
+            }
            timer().endCpuTimer();
        }
 
@@ -30,9 +34,15 @@ namespace StreamCompaction {
          */
        int compactWithoutScan(int n, int *odata, const int *idata) {
            timer().startCpuTimer();
-            // TODO
+            int out_ptr = 0;
+            for (int i = 0; i < n; ++i) {
+                if (idata[i] != 0) {
+                    odata[out_ptr] = idata[i];
+                    ++out_ptr;
+                }
+            }
            timer().endCpuTimer();
-            return -1;
+            return out_ptr;
        }
 
@@ -42,9 +52,31 @@ namespace StreamCompaction {
          */
        int compactWithScan(int n, int *odata, const int *idata) {
            timer().startCpuTimer();
-            // TODO
+            int* binary = new int[n];
+            int* scanOut = new int[n];
+            for (int i = 0; i < n; ++i) {
+                if (idata[i] != 0) {
+                    binary[i] = 1;
+                }
+                else {
+                    binary[i] = 0;
+                }
+            }
+            // Exclusive scan (same code as scan() above)
+            int prev = 0;
+            for (int i = 0; i < n; ++i) {
+                scanOut[i] = prev;
+                prev += binary[i];
+            }
+            int count = 0;
+            for (int i = 0; i < n; ++i) {
+                if (idata[i] != 0) {
+                    odata[scanOut[i]] = idata[i];
+                    ++count;
+                }
+            }
            timer().endCpuTimer();
-            return -1;
+            delete[] binary;
+            delete[] scanOut;
+            return count;
        }
    }
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..36b47dc 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -11,14 +11,190 @@ namespace StreamCompaction {
            static PerformanceTimer timer;
            return timer;
        }
+        __global__ void kernUpSweep(int n, int d, int* idata) {
+            // Parallel reduction
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+
+            int k = index * (1 << (d + 1));
+            idata[k + (1 << (d + 1)) - 1] += idata[k + (1 << d) - 1];
+        }
+
+        __global__ void kernDownSweep(int n, int d, int* idata) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+
+            int k = index * (1 << (d + 1));
+            int t = idata[k + (1 << d) - 1];
+            idata[k + (1 << d) - 1] = idata[k + (1 << (d + 1)) - 1];
+            idata[k + (1 << (d + 1)) - 1] += t;
+        }
+
+        // Local __host__ __device__ copies of the provided ilog2/ilog2ceil so
+        // they can also be called from device code
+        __host__ __device__ int copyIlog2(int x) {
+            int lg = 0;
+            while (x >>= 1) {
+                ++lg;
+            }
+            return lg;
+        }
+
+        __host__ __device__ int copyIlog2ceil(int x) {
+            return x == 1 ? 0 : copyIlog2(x - 1) + 1;
+        }
+
+        // Steps for shared-memory scan
+        // 1. Launch kernel with N / (blockSize * 2) blocks, blockSize threads per block
+        // 2. For each block, use shared memory of size 2 * blockSize
+        // 3. Load values from the input array into shared memory in pairs
+        // 4. Do the same indexing up-sweep scheme as before on individual blocks
+        // 5. Get the INCLUSIVE value at the end of each block and write it to a temp array
+        // 6. Zero out the last element of each block, like the root zeroing
+        // 7. Down-sweep on individual blocks
+        // 8. Pass the temp array as the new input array and recurse on steps 1-7
+        //    (or, as scan() below does, just scan the temp array on the CPU)
+        // 9. Add the scanned temp array values as per-block offsets
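+
+        // Example trace of the up-sweep/down-sweep on one 8-element block
+        // (the classic Blelloch scan example; illustrative only):
+        //   input:          [3 1 7  0 4  1 6  3]
+        //   up-sweep d=0:   [3 4 7  7 4  5 6  9]
+        //   up-sweep d=1:   [3 4 7 11 4  5 6 14]
+        //   up-sweep d=2:   [3 4 7 11 4  5 6 25]
+        //   zero root:      [3 4 7 11 4  5 6  0]
+        //   down-sweep d=2: [3 4 7  0 4  5 6 11]
+        //   down-sweep d=1: [3 0 7  4 4 11 6 16]
+        //   down-sweep d=0: [0 3 4 11 11 15 16 22]  <- exclusive scan of the input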
+        __global__ void kernSharedScan(int n, int* idata, int* temp) {
+            // Parallel reduction w/ shared memory; shared memory holds 2 * blockSize elements
+            __shared__ int partialSum[2 * blockSize];
+            // Load input memory into shared memory in pairs
+            int index = threadIdx.x + (blockIdx.x * blockDim.x); // global thread index (N / 2 threads are launched)
+            int sharedIdx = threadIdx.x; // per-block index
+            partialSum[sharedIdx * 2] = idata[index * 2];
+            partialSum[sharedIdx * 2 + 1] = idata[index * 2 + 1];
+            // Per-block up-sweep
+            int logBlock = copyIlog2ceil(blockDim.x * 2); // blockSize * 2, since each block scans blockSize * 2 elements
+            for (int d = 0; d < logBlock; ++d) { // runs log2(blockSize * 2) times
+                __syncthreads();
+                if (sharedIdx < (blockDim.x / (1 << d))) {
+                    int k = sharedIdx * (1 << (d + 1));
+                    partialSum[k + (1 << (d + 1)) - 1] += partialSum[k + (1 << d) - 1];
+                }
+            }
+            __syncthreads();
+            // Save the last (inclusive) value of the block for the next level's
+            // scan and offsets, then zero out the root
+            if (sharedIdx == blockDim.x - 1) { // last thread in the block
+                temp[blockIdx.x] = partialSum[2 * blockDim.x - 1]; // block total = last element after up-sweep
+                partialSum[2 * blockDim.x - 1] = 0;
+            }
+            __syncthreads();
+            // Per-block down-sweep
+            for (int d = logBlock - 1; d >= 0; --d) {
+                __syncthreads(); // each level must see the previous level's writes
+                if (sharedIdx < (blockDim.x / (1 << d))) {
+                    int k = sharedIdx * (1 << (d + 1));
+                    int t = partialSum[k + (1 << d) - 1];
+                    partialSum[k + (1 << d) - 1] = partialSum[k + (1 << (d + 1)) - 1];
+                    partialSum[k + (1 << (d + 1)) - 1] += t;
+                }
+            }
+            __syncthreads();
+            // Write the scanned values back to the input array in place
+            idata[index * 2] = partialSum[sharedIdx * 2];
+            idata[index * 2 + 1] = partialSum[sharedIdx * 2 + 1];
+        }
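+
+        // Launch shape for kernSharedScan (illustrative numbers): each block
+        // scans 2 * blockSize elements, so scanning paddedN elements takes
+        // paddedN / (2 * blockSize) blocks of blockSize threads each, e.g.
+        // paddedN = 1 << 20 with blockSize = 256 -> 2048 blocks, each writing
+        // one block total into temp for the next level.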
+
+        // Function to add an offset to each block
+        // e.g. offsets = [10, 20, 30]: add 10 to block 0, 20 to block 1, 30 to block 2
+        __global__ void addOffsets(int n, int* idata, int* offsets) {
+            // n is the number of elements in idata
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+            idata[index] += offsets[index / (blockDim.x * 2)];
+        }
+
+        void scan(int n, int* odata, const int* idata) {
+            // Account for non-powers of 2 by padding with 0
+            int paddedN = (1 << ilog2ceil(n));
+            int* dev_idata;
+            cudaMalloc((void**)&dev_idata, paddedN * sizeof(int));
+            checkCUDAError("cudaMalloc dev_idata failed!");
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            cudaMemset(dev_idata + n, 0, (paddedN - n) * sizeof(int));
+            cudaDeviceSynchronize();
+
+            timer().startGpuTimer();
+            // Determine size of the temp (block sums) array after 1 pass
+            int tempSize = paddedN / (blockSize * 2);
+            dim3 gridSize(tempSize);
+            int* dev_temp;
+            int* temp = (int*)malloc(tempSize * sizeof(int));
+            cudaMalloc((void**)&dev_temp, tempSize * sizeof(int));
+            cudaDeviceSynchronize();
+            kernSharedScan<<<gridSize, blockSize>>>(paddedN, dev_idata, dev_temp);
+            checkCUDAError("kernSharedScan failed!");
+            cudaDeviceSynchronize();
+            cudaMemcpy(temp, dev_temp, tempSize * sizeof(int), cudaMemcpyDeviceToHost);
+            cudaDeviceSynchronize();
+            int prev = 0;
+            for (int i = 0; i < tempSize; ++i) { // In-place CPU exclusive scan of the block sums
+                int tempVal = temp[i];
+                temp[i] = prev;
+                prev += tempVal;
+            }
+
+            cudaMemcpy(dev_temp, temp, tempSize * sizeof(int), cudaMemcpyHostToDevice);
+            cudaDeviceSynchronize();
+            dim3 offsetGridSize(paddedN / blockSize);
+            addOffsets<<<offsetGridSize, blockSize>>>(paddedN, dev_idata, dev_temp);
+            checkCUDAError("addOffsets failed!");
+            cudaDeviceSynchronize();
+            timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost);
+            cudaFree(dev_idata);
+            cudaFree(dev_temp);
+            free(temp);
+        }
+
+        __global__ void kernZeroRoot(int n, int* idata) {
+            // Root is the last element
+            idata[n - 1] = 0;
+        }
 
        /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
-        void scan(int n, int *odata, const int *idata) {
+        void oldScan(int n, int *odata, const int *idata) {
+            // Account for non-powers of 2 by padding with 0
+            int paddedN = (1 << ilog2ceil(n));
+            int* dev_idata;
+            cudaMalloc((void**)&dev_idata, paddedN * sizeof(int));
+            checkCUDAError("cudaMalloc dev_idata failed!");
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            cudaMemset(dev_idata + n, 0, (paddedN - n) * sizeof(int));
+            cudaDeviceSynchronize();
+
            timer().startGpuTimer();
-            // TODO
+            // Up-sweep
+            for (int i = 0; i < ilog2ceil(n); ++i) {
+                int numThreads = paddedN / (1 << (i + 1));
+                dim3 upSweepGridSize((numThreads + blockSize - 1) / blockSize);
+                kernUpSweep<<<upSweepGridSize, blockSize>>>(numThreads, i, dev_idata);
+                checkCUDAError("kernUpSweep failed!");
+                cudaDeviceSynchronize();
+            }
+
+            // Down-sweep
+            kernZeroRoot<<<1, 1>>>(paddedN, dev_idata);
+            for (int i = ilog2ceil(n) - 1; i >= 0; --i) {
+                int numThreads = paddedN / (1 << (i + 1));
+                dim3 downSweepGridSize((numThreads + blockSize - 1) / blockSize);
+                kernDownSweep<<<downSweepGridSize, blockSize>>>(numThreads, i, dev_idata);
+                checkCUDAError("kernDownSweep failed!");
+                cudaDeviceSynchronize();
+            }
            timer().endGpuTimer();
+            cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost);
+            cudaFree(dev_idata);
        }
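+
+        // Note (illustrative numbers): both scan() and oldScan() pad to the
+        // next power of two, e.g. n = 100 -> paddedN = 128, with the 28-element
+        // tail zero-filled so the padding cannot affect the prefix sums.
+        // oldScan() is the global-memory version kept for comparison; scan()
+        // above is the shared-memory version that compact() below also uses.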
 
        /**
@@ -31,10 +207,104 @@ namespace StreamCompaction {
          * @returns The number of elements remaining after compaction.
          */
        int compact(int n, int *odata, const int *idata) {
+            // Account for non-powers of 2 by padding with 0
+            int paddedN = (1 << ilog2ceil(n));
+            int* dev_idata;
+            int* dev_odata;
+            int* dev_bool;
+            int* dev_indices;
+
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_idata failed!");
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+            // Pad the bool array instead of idata to save operations in kernMapToBoolean
+            cudaMalloc((void**)&dev_bool, paddedN * sizeof(int));
+            checkCUDAError("cudaMalloc dev_bool failed!");
+            cudaMemset(dev_bool + n, 0, (paddedN - n) * sizeof(int));
+
+            cudaMalloc((void**)&dev_indices, paddedN * sizeof(int));
+            checkCUDAError("cudaMalloc dev_indices failed!");
+            cudaMalloc((void**)&dev_odata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_odata failed!");
+            cudaDeviceSynchronize();
+
+            // Determine size of the temp (block sums) array after 1 pass
+            int tempSize = paddedN / (blockSize * 2);
+            dim3 gridSize(tempSize);
+            int* dev_temp;
+            int* temp = (int*)malloc(tempSize * sizeof(int));
+            cudaMalloc((void**)&dev_temp, tempSize * sizeof(int));
+            checkCUDAError("cudaMalloc dev_temp failed!");
+
            timer().startGpuTimer();
-            // TODO
+            // Binarize
+            dim3 nGridSize((n + blockSize - 1) / blockSize);
+            StreamCompaction::Common::kernMapToBoolean<<<nGridSize, blockSize>>>(n, dev_bool, dev_idata);
+            checkCUDAError("kernMapToBoolean failed!");
+            cudaDeviceSynchronize();
+            // We need the bool array for scatter, so copy the bool result into
+            // indices to be scanned in place
+            cudaMemcpy(dev_indices, dev_bool, paddedN * sizeof(int), cudaMemcpyDeviceToDevice);
+            checkCUDAError("cudaMemcpy failed!");
+            cudaDeviceSynchronize();
+
+            // Shared-memory scan, same sequence as scan() above
+            kernSharedScan<<<gridSize, blockSize>>>(paddedN, dev_indices, dev_temp);
+            checkCUDAError("kernSharedScan failed!");
+            cudaDeviceSynchronize();
+            cudaMemcpy(temp, dev_temp, tempSize * sizeof(int), cudaMemcpyDeviceToHost);
+            cudaDeviceSynchronize();
+            int prev = 0;
+            for (int i = 0; i < tempSize; ++i) { // In-place CPU exclusive scan of the block sums
+                int tempVal = temp[i];
+                temp[i] = prev;
+                prev += tempVal;
+            }
+            cudaMemcpy(dev_temp, temp, tempSize * sizeof(int), cudaMemcpyHostToDevice);
+            cudaDeviceSynchronize();
+            dim3 offsetGridSize(paddedN / blockSize);
+            addOffsets<<<offsetGridSize, blockSize>>>(paddedN, dev_indices, dev_temp);
+            checkCUDAError("addOffsets failed!");
+            cudaDeviceSynchronize();
+
+            // Scatter
+            StreamCompaction::Common::kernScatter<<<nGridSize, blockSize>>>(n, dev_odata, dev_idata, dev_bool, dev_indices);
+            checkCUDAError("kernScatter failed!");
+            cudaDeviceSynchronize();
            timer().endGpuTimer();
-            return -1;
+
+            // The compacted count is the last exclusive-scan value plus the last
+            // bool; the second term matters when n is itself a power of two and
+            // there is no zero padding after the last element.
+            int lastIndex, lastBool;
+            cudaMemcpy(&lastIndex, dev_indices + paddedN - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            cudaMemcpy(&lastBool, dev_bool + paddedN - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            int finalNum = lastIndex + lastBool;
+            cudaMemcpy(odata, dev_odata, finalNum * sizeof(int), cudaMemcpyDeviceToHost);
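+            // odata now holds the finalNum surviving elements; nothing past
+            // index finalNum - 1 in odata is written.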
+            cudaFree(dev_idata);
+            cudaFree(dev_bool);
+            cudaFree(dev_indices);
+            cudaFree(dev_odata);
+            cudaFree(dev_temp);
+            free(temp);
+            return finalNum;
        }
    }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..e49a86a 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -12,14 +12,69 @@ namespace StreamCompaction {
            return timer;
        }
-        // TODO: __global__
+        __global__ void kernScan(int n, int d, int* odata, const int* idata) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+            if (index >= (1 << (d - 1))) {
+                odata[index] = idata[index - (1 << (d - 1))] + idata[index];
+            }
+            else {
+                odata[index] = idata[index];
+            }
+        }
+
+        __global__ void kernInclusiveToExclusive(int n, int* odata, const int* idata) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+
+            if (index == 0) {
+                odata[index] = 0;
+                return;
+            }
+            odata[index] = idata[index - 1];
+        }
 
        /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
        void scan(int n, int *odata, const int *idata) {
+            dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+            // Put the host input array onto the device
+            int* dev_odata;
+            int* dev_idata;
+            cudaMalloc((void**)&dev_odata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_odata failed!");
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_idata failed!");
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            cudaDeviceSynchronize();
+
            timer().startGpuTimer();
-            // TODO
+            for (int i = 1; i <= ilog2ceil(n); ++i) {
+                kernScan<<<fullBlocksPerGrid, blockSize>>>(n, i, dev_odata, dev_idata);
+                checkCUDAError("kernScan failed!");
+                cudaDeviceSynchronize();
+                // Ping-pong: after the swap, the freshest result lives in dev_idata
+                int* temp = dev_idata;
+                dev_idata = dev_odata;
+                dev_odata = temp;
+            }
+            // Shift the inclusive result (in dev_idata) right by one to make it exclusive
+            kernInclusiveToExclusive<<<fullBlocksPerGrid, blockSize>>>(n, dev_odata, dev_idata);
+            checkCUDAError("kernInclusiveToExclusive failed!");
+            cudaDeviceSynchronize();
            timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+            cudaFree(dev_odata);
+            cudaFree(dev_idata);
        }
    }
 }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..a4f3c39 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -18,11 +18,21 @@ namespace StreamCompaction {
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
        void scan(int n, int *odata, const int *idata) {
+            thrust::host_vector<int> thrust_idata(idata, idata + n);
+
+            thrust::device_vector<int> dev_thrust_idata = thrust_idata;
+            thrust::device_vector<int> dev_thrust_odata(n);
+            cudaDeviceSynchronize();
+
            timer().startGpuTimer();
-            // TODO use `thrust::exclusive_scan`
-            // example: for device_vectors dv_in and dv_out:
-            // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            thrust::exclusive_scan(dev_thrust_idata.begin(), dev_thrust_idata.end(), dev_thrust_odata.begin());
            timer().endGpuTimer();
+
+            thrust::copy(dev_thrust_odata.begin(), dev_thrust_odata.end(), odata);
        }
    }
 }