81 changes: 75 additions & 6 deletions README.md
@@ -3,12 +3,81 @@ CUDA Stream Compaction

**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**

* (TODO) YOUR NAME HERE
* (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
* Yiyang Chen
* [LinkedIn](https://www.linkedin.com/in/yiyang-chen-6a7641210/), [personal website](https://cyy0915.github.io/)
* Tested on: Windows 10, i5-8700k @ 3.7GHz, GTX 1080, personal computer

### (TODO: Your README)
## Features
I implemented Parts 1 through 5.

Include analysis, etc. (Remember, this is public, so don't put
anything here that you don't want to share with the world.)
For Part 5, I optimized the work-efficient scan so that once the array is large enough it is always faster than the naive scan, which in turn is always faster than the CPU scan. I did this by reducing the number of threads launched at each level of the up-sweep and down-sweep, as shown in the sketch below.
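
The key host-side loop, lightly simplified from the `efficient.cu` changes further down this diff, launches only as many threads as there are active tree nodes at each depth; the down-sweep mirrors it with `N >> (d + 1)`, where `N` is the array length rounded up to a power of two:

```cuda
// Up-sweep: at depth d only n >> (d + 1) segments lie entirely inside the
// first n elements, so launch exactly that many threads instead of one
// thread per array element.
for (int d = 0; d < ilog2ceil(n); d++) {
    int computeCount = n >> (d + 1);
    dim3 blockCount((computeCount + blockSize - 1) / blockSize);
    kernUpSweep<<<blockCount, blockSize>>>(computeCount, 1 << (d + 1), buffer);
}
```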

## Performance Analysis
I roughly tuned the block sizes and found that 128 works well; the timing sketch below shows how I compared configurations.
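
A minimal standalone way to time one configuration is to wrap a scan call in CUDA events. This is only a sketch: it assumes the repo's `stream_compaction/efficient.h` header is on the include path, and it measures the whole call (including device allocation and transfers), unlike the project's internal GPU timer. I changed the `blockSize` constant in `naive.cu`/`efficient.cu` between runs.

```cuda
#include <cuda_runtime.h>
#include <cstdio>
#include "stream_compaction/efficient.h"  // assumed include path

// Times one full Efficient::scan call with CUDA events. Rerun after editing
// the blockSize constant (64, 128, 256, ...) to compare block sizes.
float timeEfficientScan(int n, int* odata, const int* idata) {
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    StreamCompaction::Efficient::scan(n, odata, idata);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ms;
}
```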

![](img/scan.png)

![](img/compact.png)

See the graphs above; note the scales on the axes.

* **Phenomena:** In the `scan` function, once the array length is large enough (> 2^18), the runtimes rank cpu > naive > efficient > thrust. In the `compact` function, again for large enough arrays, cpu > efficient. When the array length is small, however, the order flips: cpu < naive < thrust < efficient.

* **Explanation:**
    * cpu: O(n) time complexity, as expected; I think the bottleneck is the computation itself.
    * naive: runs in parallel on the GPU, so it is faster than the CPU implementation once the array is large. When the array is small I think the bottleneck is memory I/O, so it is slower than the CPU version.
    * efficient: runs in parallel on the GPU and does not need two ping-pong buffers, so it is faster than naive for large arrays. When the array is small the bottleneck is again memory I/O, and it issues more kernel launches than the naive implementation, so it is even slower than naive there (see the sketch below).
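
For comparison, the naive scan's host loop (from the `naive.cu` changes below) ping-pongs between two device buffers and issues one kernel launch per level, while the work-efficient scan keeps a single buffer but issues roughly twice as many launches:

```cuda
// Naive scan (naive.cu): ilog2ceil(n) launches, reading buffer1, writing
// buffer2, then swapping -- two full-size device buffers are required.
for (int d = 1; d <= ilog2ceil(n); d++) {
    int bias = 1 << (d - 1);
    kernScan<<<blockCount, blockSize>>>(n, bias, buffer2, buffer1);
    std::swap(buffer1, buffer2);
}
// Work-efficient scan (efficient.cu): one buffer updated in place, but
// 2 * ilog2ceil(n) + 1 launches (up-sweep, kernSetArray, down-sweep), so the
// per-launch overhead dominates for small arrays.
```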


```
****************
** SCAN TESTS **
****************
[ 23 47 43 2 9 23 22 34 41 24 17 1 6 ... 40 0 ]
==== cpu scan, power-of-two ====
elapsed time: 1.4016ms (std::chrono Measured)
[ 0 23 70 113 115 124 147 169 203 244 268 285 286 ... 25709761 25709801 ]
==== cpu scan, non-power-of-two ====
elapsed time: 1.4073ms (std::chrono Measured)
[ 0 23 70 113 115 124 147 169 203 244 268 285 286 ... 25709686 25709735 ]
passed
==== naive scan, power-of-two ====
elapsed time: 0.76112ms (CUDA Measured)
passed
==== naive scan, non-power-of-two ====
elapsed time: 0.763008ms (CUDA Measured)
passed
==== work-efficient scan, power-of-two ====
elapsed time: 0.476032ms (CUDA Measured)
passed
==== work-efficient scan, non-power-of-two ====
elapsed time: 0.479264ms (CUDA Measured)
passed
==== thrust scan, power-of-two ====
elapsed time: 0.191008ms (CUDA Measured)
passed
==== thrust scan, non-power-of-two ====
elapsed time: 0.213728ms (CUDA Measured)
passed

*****************************
** STREAM COMPACTION TESTS **
*****************************
[ 1 1 3 2 1 3 2 2 3 2 3 1 0 ... 2 0 ]
==== cpu compact without scan, power-of-two ====
elapsed time: 2.1584ms (std::chrono Measured)
[ 1 1 3 2 1 3 2 2 3 2 3 1 3 ... 1 2 ]
passed
==== cpu compact without scan, non-power-of-two ====
elapsed time: 2.0913ms (std::chrono Measured)
[ 1 1 3 2 1 3 2 2 3 2 3 1 3 ... 3 3 ]
passed
==== cpu compact with scan ====
elapsed time: 5.7503ms (std::chrono Measured)
[ 1 1 3 2 1 3 2 2 3 2 3 1 3 ... 1 2 ]
passed
==== work-efficient compact, power-of-two ====
elapsed time: 0.618016ms (CUDA Measured)
passed
==== work-efficient compact, non-power-of-two ====
elapsed time: 0.636352ms (CUDA Measured)
passed
```
Binary file added img/compact.png
Binary file added img/scan.png
12 changes: 10 additions & 2 deletions stream_compaction/common.cu
@@ -23,7 +23,10 @@ namespace StreamCompaction {
* which map to 0 will be removed, and elements which map to 1 will be kept.
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
// TODO
int index = (blockIdx.x * blockDim.x) + threadIdx.x;
if (index < n) {
bools[index] = idata[index] != 0;
}
}

/**
@@ -32,7 +35,12 @@ namespace StreamCompaction {
*/
__global__ void kernScatter(int n, int *odata,
const int *idata, const int *bools, const int *indices) {
// TODO
int index = (blockIdx.x * blockDim.x) + threadIdx.x;
if (index < n) {
if (bools[index] == 1) {
odata[indices[index]] = idata[index];
}
}
}

}
35 changes: 30 additions & 5 deletions stream_compaction/cpu.cu
@@ -19,7 +19,10 @@ namespace StreamCompaction {
*/
void scan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
odata[0] = 0;
for (int i = 1; i < n; i++) {
odata[i] = odata[i - 1] + idata[i - 1];
}
timer().endCpuTimer();
}

@@ -30,9 +33,15 @@ namespace StreamCompaction {
*/
int compactWithoutScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
int oi = 0;
for (int i = 0; i < n; i++) {
if (idata[i] != 0) { // keep every nonzero element; only zeroes are discarded
odata[oi] = idata[i];
oi++;
}
}
timer().endCpuTimer();
return -1;
return oi;
}

/**
@@ -42,9 +51,25 @@ namespace StreamCompaction {
*/
int compactWithScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
int* bools = new int[n], * indices = new int[n];
for (int i = 0; i < n; i++) {
bools[i] = idata[i] != 0;
}
//scan
indices[0] = 0;
for (int i = 1; i < n; i++) {
indices[i] = indices[i - 1] + bools[i - 1];
}
for (int i = 0; i < n; i++) {
if (idata[i] != 0) { // match the bools mapping above: nonzero elements are kept
odata[indices[i]] = idata[i];
}
}
int count = indices[n - 1] + bools[n - 1];
delete[] bools;
delete[] indices;
timer().endCpuTimer();
return -1;
return count;
}
}
}
85 changes: 82 additions & 3 deletions stream_compaction/efficient.cu
@@ -12,15 +12,54 @@ namespace StreamCompaction {
return timer;
}

// One up-sweep step: each thread owns a segment of length t = 2^(d+1) and
// adds the left half's partial sum into the last element of its segment.
__global__ void kernUpSweep(int n, int t, int* data) {
int index = (blockIdx.x * blockDim.x) + threadIdx.x;
if (index < n) {
data[t * index + t - 1] += data[t * index + (t >> 1) - 1];
}
}
// One down-sweep step: the left child takes the value coming down from the
// parent (stored in the right slot), and the right child accumulates that
// value plus the left child's old partial sum.
__global__ void kernDownSweep(int n, int t, int* data) {
int index = (blockIdx.x * blockDim.x) + threadIdx.x;
if (index < n) {
int tmp = data[t * index + (t >> 1) - 1];
data[t * index + (t >> 1) - 1] = data[t * index + t - 1];
data[t * index + t - 1] += tmp;
}
}
// Sets data[index] = value on the device; used to zero the root of the sum
// tree without an expensive host-side call.
__global__ void kernSetArray(int value, int index, int* data) {
data[index] = value;
}

/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
int blockSize = 128;
int* buffer;
int N = 1 << ilog2ceil(n);
cudaMalloc((void**)&buffer, N * sizeof(int));
cudaMemcpy(buffer, idata, n * sizeof(int), cudaMemcpyHostToDevice);

timer().startGpuTimer();
// TODO
for (int d = 0; d < ilog2ceil(n); d++) {
int computeCount = n >> (d + 1); // up-sweep uses n, not N: only segments lying entirely inside the first n elements are needed
dim3 blockCount((computeCount + blockSize - 1) / blockSize);
kernUpSweep<<<blockCount, blockSize>>>(computeCount, 1 << (d + 1), buffer);
}
// cudaMemset(buffer + N - 1, 0, sizeof(int)) costs too much here; zero the root with a one-thread kernel instead
kernSetArray<<<1, 1>>>(0, N - 1, buffer);
for (int d = ilog2ceil(n) - 1; d >= 0; d--) {
int computeCount = N >> (d + 1); // down-sweep walks the full padded tree of length N
dim3 blockCount((computeCount + blockSize - 1) / blockSize);
kernDownSweep<<<blockCount, blockSize>>>(computeCount, 1 << (d + 1), buffer);
}
timer().endGpuTimer();

cudaMemcpy(odata, buffer, n * sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(buffer);
}


/**
* Performs stream compaction on idata, storing the result into odata.
* All zeroes are discarded.
@@ -31,10 +70,50 @@ namespace StreamCompaction {
* @returns The number of elements remaining after compaction.
*/
int compact(int n, int *odata, const int *idata) {
int* bools, * indices, * ibuffer, * obuffer;
cudaMalloc((void**)&bools, n * sizeof(int));
cudaMalloc((void**)&indices, n * sizeof(int));
cudaMalloc((void**)&ibuffer, n * sizeof(int));
cudaMalloc((void**)&obuffer, n * sizeof(int));
cudaMemcpy(ibuffer, idata, n * sizeof(int), cudaMemcpyHostToDevice);

int blockSize = 128;
dim3 blockCount((n + blockSize - 1) / blockSize);

timer().startGpuTimer();
// TODO
Common::kernMapToBoolean<<<blockCount, blockSize>>>(n, bools, ibuffer);

//------------------ scan ---------------------
int N = 1 << ilog2ceil(n);
cudaMemcpy(indices, bools, n * sizeof(int), cudaMemcpyDeviceToDevice);

for (int d = 0; d < ilog2ceil(n); d++) {
int computeCount = n >> (d + 1); // as in scan(): up-sweep uses n, down-sweep uses N
dim3 blockCount((computeCount + blockSize - 1) / blockSize);
kernUpSweep<<<blockCount, blockSize>>>(computeCount, 1 << (d + 1), indices);
}
kernSetArray<<<1, 1>>>(0, N - 1, indices);
for (int d = ilog2ceil(n) - 1; d >= 0; d--) {
int computeCount = N >> (d + 1);
dim3 blockCount((computeCount + blockSize - 1) / blockSize);
kernDownSweep<<<blockCount, blockSize>>>(computeCount, 1 << (d + 1), indices);
}
//------------------ scan ---------------------

Common::kernScatter<<<blockCount, blockSize>>>(n, obuffer, ibuffer, bools, indices);
timer().endGpuTimer();
return -1;

// compacted length = exclusive-scan value of the last element plus its boolean flag
int count, bias;
cudaMemcpy(&count, indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(&bias, bools + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
int remainNum = count + bias;
cudaMemcpy(odata, obuffer, remainNum * sizeof(int), cudaMemcpyDeviceToHost);

cudaFree(bools);
cudaFree(indices);
cudaFree(ibuffer);
cudaFree(obuffer);
return remainNum;
}
}
}
32 changes: 30 additions & 2 deletions stream_compaction/naive.cu
@@ -12,14 +12,42 @@ namespace StreamCompaction {
return timer;
}
// TODO: __global__
__global__ void kernScan(int n, int bias, int* odata, const int* idata) {
int index = (blockIdx.x * blockDim.x) + threadIdx.x;
if (index >= n) {
return; // guard first: otherwise the else branch below writes past the end of the array for out-of-range threads
}
if (index >= bias) {
odata[index] = idata[index - bias] + idata[index];
}
else {
odata[index] = idata[index];
}
}

/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
void scan(int n, int* odata, const int* idata) {
int blockSize = 128;
dim3 blockCount((n + blockSize - 1) / blockSize);

int* buffer1, *buffer2;
cudaMalloc((void**)&buffer1, n * sizeof(int));
cudaMalloc((void**)&buffer2, n * sizeof(int));
cudaMemcpy(buffer1, idata, n * sizeof(int), cudaMemcpyHostToDevice);

timer().startGpuTimer();
// TODO
int levels = ilog2ceil(n);
for (int d = 1; d <= levels; d++) {
int bias = 1 << (d - 1);
kernScan<<<blockCount, blockSize>>>(n, bias, buffer2, buffer1);

std::swap(buffer1, buffer2);
}
timer().endGpuTimer();

// shift right by one and prepend 0 to turn the inclusive result into the required exclusive scan
cudaMemcpy(odata + 1, buffer1, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost);
odata[0] = 0;
cudaFree(buffer1);
cudaFree(buffer2);
}
}
}
8 changes: 8 additions & 0 deletions stream_compaction/thrust.cu
@@ -18,11 +18,19 @@ namespace StreamCompaction {
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
thrust::device_vector<int> thrust_idata(idata, idata + n);
thrust::device_vector<int> thrust_odata(n);

timer().startGpuTimer();
// TODO use `thrust::exclusive_scan`
// example: for device_vectors dv_in and dv_out:
// thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());

thrust::exclusive_scan(thrust_idata.begin(), thrust_idata.end(), thrust_odata.begin());

timer().endGpuTimer();

thrust::copy(thrust_odata.begin(), thrust_odata.end(), odata);
}
}
}