diff --git a/README.md b/README.md
index 0e38ddb..59f0a2a 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,116 @@ CUDA Stream Compaction
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Shineng Tang
+  * [LinkedIn](https://www.linkedin.com/in/shineng-t-224192195/)
+* Tested on: Windows 11, i9-10900K @ 3.7GHz, 32GB RAM, RTX 3090 24GB
 
-### (TODO: Your README)
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+# Project Description
+This project implements several versions of the **scan** (_prefix sum_) algorithm and stream compaction in CUDA.
+
+![](img/Stream-compaction.png)
+
+## Main Features
+* `StreamCompaction::CPU::scan`: computes an exclusive prefix sum on the CPU.
+* `StreamCompaction::CPU::compactWithoutScan`: CPU stream compaction without using the `scan` function.
+* `StreamCompaction::CPU::compactWithScan`: CPU stream compaction using the `scan` function.
+* `StreamCompaction::Naive::scan`: a naive parallel GPU scan.
+* `StreamCompaction::Efficient::scan`: a **work-efficient** parallel GPU scan built from an _upsweep_ and a _downsweep_ phase.
+* `StreamCompaction::Thrust::scan`: a short wrapper around the **Thrust** library's exclusive scan.
+* `StreamCompaction::Efficient::compact`: a CUDA stream compaction function built on the work-efficient scan.
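+
+To make the connection between `scan` and `compact` concrete, here is a small CPU-side walkthrough of the map → exclusive scan → scatter recipe. This is an illustrative sketch only (the array contents and variable names are made up for the example), not the project's actual implementation:
+
+```cpp
+#include <cstdio>
+
+int main() {
+    const int n = 6;
+    int in[n] = { 1, 3, 0, 1, 0, 2 };
+    int bools[n];    // map:  1 where the element is kept, 0 where it is removed
+    int indices[n];  // exclusive scan of bools = output slot of each kept element
+    int out[n];
+
+    for (int i = 0; i < n; i++) {
+        bools[i] = in[i] != 0 ? 1 : 0;                   // [1, 1, 0, 1, 0, 1]
+    }
+    indices[0] = 0;
+    for (int i = 1; i < n; i++) {
+        indices[i] = indices[i - 1] + bools[i - 1];      // [0, 1, 2, 2, 3, 3]
+    }
+    for (int i = 0; i < n; i++) {
+        if (bools[i]) {
+            out[indices[i]] = in[i];                     // scatter -> [1, 3, 1, 2]
+        }
+    }
+    // the exclusive scan misses the last flag, so it is added back in
+    int count = indices[n - 1] + bools[n - 1];           // 4 elements survive
+    printf("kept %d of %d elements\n", count, n);
+    return 0;
+}
+```
+
+On the GPU, `kernMapToBoolean` and `kernScatter` perform the map and scatter steps in parallel, and the scan in the middle is exactly what the naive and work-efficient kernels implement.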
+
+## Extra Credit Features
+* `StreamCompaction::Efficient::radixSort`: a CUDA-based radix sort. It is noticeably faster than `std::sort` on large arrays; the test cases are shown at the bottom of the **Test Outputs** section below.
+* Thread optimization: by remapping thread indices and shrinking the block count at each level of the upsweep and downsweep, performance increases dramatically. A macro (`THREAD_OPTIMIZATION`) toggles this thread-optimization mode.
+
+# Test Outputs
+```
+
+****************
+** SCAN TESTS **
+****************
+    [ 47 21 28 13 2 26 1 14 4 49 25 20 43 ... 15 0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 2.729ms (std::chrono Measured)
+    [ 0 47 68 96 109 111 137 138 152 156 205 230 250 ... 51332490 51332505 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 2.7305ms (std::chrono Measured)
+    [ 0 47 68 96 109 111 137 138 152 156 205 230 250 ... 51332413 51332419 ]
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 19.9673ms (CUDA Measured)
+    [ 0 47 68 96 109 111 137 138 152 156 205 230 250 ... 51332490 51332505 ]
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 18.4846ms (CUDA Measured)
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 0.527424ms (CUDA Measured)
+    [ 0 47 68 96 109 111 137 138 152 156 205 230 250 ... 51332490 51332505 ]
+    passed
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 0.519136ms (CUDA Measured)
+    [ 0 47 68 96 109 111 137 138 152 156 205 230 250 ... 51332413 51332419 ]
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 0.51376ms (CUDA Measured)
+    [ 0 47 68 96 109 111 137 138 152 156 205 230 250 ... 51332490 51332505 ]
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 0.469344ms (CUDA Measured)
+    [ 0 47 68 96 109 111 137 138 152 156 205 230 250 ... 51332413 51332419 ]
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [ 1 3 0 1 0 2 1 0 0 1 1 0 1 ... 3 0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 4.1334ms (std::chrono Measured)
+    [ 1 3 1 2 1 1 1 1 3 3 2 2 2 ... 2 3 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 4.115ms (std::chrono Measured)
+    [ 1 3 1 2 1 1 1 1 3 3 2 2 2 ... 2 1 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 6.5739ms (std::chrono Measured)
+    [ 1 3 1 2 1 1 1 1 3 3 2 2 2 ... 2 3 ]
+    passed
+==== work-efficient compact, power-of-two ====
+   elapsed time: 0.602784ms (CUDA Measured)
+    [ 1 3 1 2 1 1 1 1 3 3 2 2 2 ... 2 3 ]
+    passed
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 0.604032ms (CUDA Measured)
+    [ 1 3 1 2 1 1 1 1 3 3 2 2 2 ... 2 1 ]
+    passed
+
+*****************************
+**    RADIX SORT TESTS     **
+*****************************
+==== radix sort, power-of-two ====
+   elapsed time: 6.89866ms (CUDA Measured)
+    [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 49 49 ]
+   elapsed time: 34.7704ms (std::chrono Measured)
+    passed
+==== radix sort, non-power-of-two ====
+   elapsed time: 6.59078ms (CUDA Measured)
+    [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 49 49 ]
+   elapsed time: 35.2076ms (std::chrono Measured)
+    passed
+```
+
+# Performance Analysis
+
+![](img/comparison1.png)
+
+For the **scan algorithm**, when the arrays are relatively small the CPU version is slightly faster than every GPU implementation, even Thrust. Increasing the array size does not immediately change that: even at around 2^14 elements, the supposedly faster work-efficient scan still trails the other implementations. Once the array size reaches 2^22, however, the work-efficient scan is already fairly close to the Thrust scan. Another observation is that the naive GPU scan does not overtake the CPU scan until roughly 2^19 elements, which comes down to its reliance on global memory and the lack of any thread-index optimization, so many threads in each warp diverge and do no useful work.
+
+![](img/comparison2.png)
+
+For **stream compaction**, the chart follows a similar pattern to the scan results: once the input is large enough, the GPU implementations are consistently faster than the CPU ones.
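+
+The thread optimization mentioned under Extra Credit comes down to how threads are indexed in the upsweep and downsweep kernels. The sketch below is a simplified side-by-side of the two code paths selected by the `THREAD_OPTIMIZATION` macro in `stream_compaction/efficient.cu` (the kernel names here are invented for the comparison): the unoptimized form launches one thread per element and lets most of them fail the `k % stride == 0` test, while the optimized form launches only as many threads as there are active tree nodes at that level and remaps each thread onto one of them, so warps stay full and far less divergence occurs.
+
+```cuda
+// Unoptimized: launched with one thread per element. At level d only
+// 1 in every `stride` threads passes the test; the rest idle but still
+// occupy their warps.
+__global__ void upsweepDivergent(int n, int d, int* data) {
+    int k = threadIdx.x + blockIdx.x * blockDim.x;
+    if (k >= n) return;
+    int stride = 1 << (d + 1);
+    if (k % stride == 0) {
+        data[k + stride - 1] += data[k + (1 << d) - 1];
+    }
+}
+
+// Optimized: launched with n = (element count) / stride threads.
+// Consecutive thread indices are remapped onto the active tree nodes,
+// so every launched thread does useful work.
+__global__ void upsweepCompacted(int n, int d, int* data) {
+    int k = threadIdx.x + blockIdx.x * blockDim.x;
+    if (k >= n) return;
+    int stride = 1 << (d + 1);
+    int idx = (k + 1) * stride - 1;
+    data[idx] += data[idx - (stride >> 1)];
+}
+```
+
+The downsweep is remapped in the same way, and the host loop halves the launch size at every level, which is what the thread-optimization toggle in the Extra Credit section refers to.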
\ No newline at end of file
diff --git a/img/Stream-compaction.png b/img/Stream-compaction.png
new file mode 100644
index 0000000..61fc07a
Binary files /dev/null and b/img/Stream-compaction.png differ
diff --git a/img/comparison1.png b/img/comparison1.png
new file mode 100644
index 0000000..8ff9a20
Binary files /dev/null and b/img/comparison1.png differ
diff --git a/img/comparison2.png b/img/comparison2.png
new file mode 100644
index 0000000..592d11a
Binary files /dev/null and b/img/comparison2.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 896ac2b..595fa22 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -11,9 +11,10 @@
 #include
 #include
 #include
+#include
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 21; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
@@ -51,7 +52,7 @@ int main(int argc, char* argv[]) {
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
@@ -71,28 +72,28 @@ int main(int argc, char* argv[]) {
     printDesc("work-efficient scan, power-of-two");
     StreamCompaction::Efficient::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, non-power-of-two");
     StreamCompaction::Efficient::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
+    printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
     printDesc("thrust scan, power-of-two");
     StreamCompaction::Thrust::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("thrust scan, non-power-of-two");
     StreamCompaction::Thrust::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
+    printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
     printf("\n");
@@ -137,16 +138,49 @@ int main(int argc, char* argv[]) {
     printDesc("work-efficient compact, power-of-two");
     count = StreamCompaction::Efficient::compact(SIZE, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
+    printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient compact, non-power-of-two");
     count = StreamCompaction::Efficient::compact(NPOT, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
+    printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
+    printf("\n");
+    printf("*****************************\n");
+    printf("**    RADIX SORT TESTS     **\n");
+    printf("*****************************\n");
+
+    genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case
+    a[SIZE - 1] = 0;
+    memcpy(b, a, SIZE * sizeof(int));
+    /*for (int i = 0; i < SIZE; i++) {
+        b[i] = a[i];
+    }*/
+    // printArray(SIZE, a, true);
+    zeroArray(SIZE, c);
+    printDesc("radix sort, power-of-two");
+    StreamCompaction::Efficient::radixSort(SIZE, c, a);
+    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+    printArray(SIZE, c, true);
+    StreamCompaction::CPU::sort(SIZE, b);
+    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+    printCmpResult(SIZE, b, c);
+
+    zeroArray(SIZE, c);
+    zeroArray(SIZE, b);
+    memcpy(b, a, NPOT * sizeof(int));
+    printDesc("radix sort, non-power-of-two");
+    StreamCompaction::Efficient::radixSort(NPOT, c, a);
+    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+    printArray(NPOT, c, true);
+    StreamCompaction::CPU::sort(NPOT, b);
+    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+    printCmpResult(NPOT, b, c);
+
     system("pause"); // stop Win32 console from closing on exit
     delete[] a;
     delete[] b;
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 2ed6d63..d845250 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -24,6 +24,13 @@ namespace StreamCompaction {
          */
         __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
             // TODO
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+
+            bools[index] = idata[index] == 0 ? 0 : 1;
         }
 
         /**
@@ -33,6 +40,13 @@ namespace StreamCompaction {
         __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) {
             // TODO
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+            if (bools[index] == 1) {
+                odata[indices[index]] = idata[index];
+            }
         }
     }
 }
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
index d2c1fed..57f97ec 100644
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.h
@@ -9,6 +9,7 @@
 #include
 #include
 #include
+#include
 
 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
 #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa11..fcbb505 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -20,6 +20,10 @@ namespace StreamCompaction {
         void scan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+            odata[0] = 0;
+            for (int i = 1; i < n; i++) {
+                odata[i] = odata[i - 1] + idata[i - 1];
+            }
             timer().endCpuTimer();
         }
 
@@ -31,8 +35,14 @@ namespace StreamCompaction {
         int compactWithoutScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+            int count = 0;
+            for (int i = 0; i < n; i++) {
+                if (idata[i] != 0) {
+                    odata[count++] = idata[i];
+                }
+            }
             timer().endCpuTimer();
-            return -1;
+            return count;
         }
 
         /**
@@ -43,8 +53,32 @@ namespace StreamCompaction {
         int compactWithScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+            int* temp = new int[n];
+            for (int i = 0; i < n; i++) {
+                temp[i] = idata[i] == 0 ? 0 : 1;
+            }
+            // scan result: exclusive prefix sum of the boolean flags
+            odata[0] = 0;
+            for (int i = 1; i < n; i++) {
+                odata[i] = odata[i - 1] + temp[i - 1];
+            }
+            //int count = odata[n - 1];
+            int count = 0;
+            for (int i = 0; i < n; i++) {
+                if (idata[i] != 0) {
+                    odata[odata[i]] = idata[i];
+                    count++;
+                }
+            }
+            timer().endCpuTimer();
+            delete[] temp;
+            return count;
+        }
+
+        void sort(int n, int* idata) {
+            timer().startCpuTimer();
+            std::sort(idata, idata + n);
             timer().endCpuTimer();
-            return -1;
         }
     }
 }
diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h
index 873c047..9ca5664 100644
--- a/stream_compaction/cpu.h
+++ b/stream_compaction/cpu.h
@@ -11,5 +11,7 @@ namespace StreamCompaction {
         int compactWithoutScan(int n, int *odata, const int *idata);
 
         int compactWithScan(int n, int *odata, const int *idata);
+
+        void sort(int n, int* idata);
     }
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..a5030f9 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -1,8 +1,13 @@
 #include
 #include
+#include
 #include "common.h"
 #include "efficient.h"
+
+#define blockSize 128
+#define THREAD_OPTIMIZATION 1
+
 namespace StreamCompaction {
     namespace Efficient {
         using StreamCompaction::Common::PerformanceTimer;
@@ -12,13 +17,101 @@ namespace StreamCompaction {
             return timer;
         }
 
-        /**
+        __global__ void kernUpSweep(int n, int d, int* data) {
+            int k = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (k >= n) {
+                return;
+            }
+
+#if THREAD_OPTIMIZATION
+            // n is the number of active tree nodes at this level; remap each
+            // thread onto the node it should update so no thread is wasted.
+            int stride = 1 << (d + 1);
+            k = (k + 1) * stride - 1;
+            data[k] += data[k - (stride >> 1)];
+#else
+            // One thread per element; only 1 in `stride` threads does any work.
+            int stride = 1 << (d + 1);
+            if (k % stride == 0) {
+                data[k + stride - 1] += data[k + (1 << d) - 1];
+            }
+#endif
+        }
+
+        __global__ void kernDownSweep(int n, int d, int* data) {
+            int k = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (k >= n) {
+                return;
+            }
+
+#if THREAD_OPTIMIZATION
+            int stride = 1 << (d + 1);
+            k = (k + 1) * stride - 1;
+            int t = data[k - (stride >> 1)];
+            data[k - (stride >> 1)] = data[k];
+            data[k] += t;
+#else
+            int stride = 1 << (d + 1);
+            int pow_2 = 1 << d;
+            if (k % stride == 0) {
+                int t = data[k + pow_2 - 1];
+                data[k + pow_2 - 1] = data[k + stride - 1];
+                data[k + stride - 1] = t + data[k + stride - 1];
+            }
+#endif
+        }
+
+        // In-place work-efficient exclusive scan over a device buffer whose
+        // length `size` is a power of two.
+        void prefixSumScan(int size, int* idata) {
+            dim3 fullBlocksPerGrid((size + blockSize - 1) / blockSize);
+            int threads = size;
+            for (int d = 0; d < ilog2ceil(size); d++) {
+#if THREAD_OPTIMIZATION
+                dim3 blocksPerGrid((threads + blockSize - 1) / blockSize);
+                threads /= 2;
+                kernUpSweep<<<blocksPerGrid, blockSize>>>(threads, d, idata);
+#else
+                kernUpSweep<<<fullBlocksPerGrid, blockSize>>>(size, d, idata);
+#endif
+            }
+            // set the last element to 0 before the downsweep
+            int val = 0;
+            cudaMemcpy(&idata[size - 1], &val, sizeof(int), cudaMemcpyHostToDevice);
+
+            // threads is already 1 at this point
+            for (int d = ilog2ceil(size) - 1; d >= 0; d--) {
+#if THREAD_OPTIMIZATION
+                dim3 blocksPerGrid((threads + blockSize - 1) / blockSize);
+                kernDownSweep<<<blocksPerGrid, blockSize>>>(threads, d, idata);
+                threads *= 2;
+#else
+                kernDownSweep<<<fullBlocksPerGrid, blockSize>>>(size, d, idata);
+#endif
+            }
+        }
+
+        /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
+            int* dev_data;
+
+            // round the buffer up to the next power of two so the scan tree is complete
+            int size = 1 << ilog2ceil(n);
+            cudaMalloc((void**)&dev_data, size * sizeof(int));
+            cudaMemcpy(dev_data, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+
+            timer().startGpuTimer();
             // TODO
+            prefixSumScan(size, dev_data);
             timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_data);
         }
 
         /**
@@ -31,10 +124,124 @@ namespace StreamCompaction {
          * @returns The number of elements remaining after compaction.
          */
         int compact(int n, int *odata, const int *idata) {
+            dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+            int* dev_bool;
+            int* dev_idata;
+            int* dev_odata;
+            int* dev_scanResult;
+
+            int count = 0;
+            int lastBool = 0;
+
+            int size = 1 << ilog2ceil(n);
+
+            cudaMalloc((void**)&dev_bool, n * sizeof(int));
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            cudaMalloc((void**)&dev_odata, n * sizeof(int));
+            cudaMalloc((void**)&dev_scanResult, size * sizeof(int));
+
+            cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
             timer().startGpuTimer();
             // TODO
+            Common::kernMapToBoolean<<<fullBlocksPerGrid, blockSize>>>(n, dev_bool, dev_idata);
+            cudaMemcpy(dev_scanResult, dev_bool, n * sizeof(int), cudaMemcpyDeviceToDevice);
+            prefixSumScan(size, dev_scanResult);
+            Common::kernScatter<<<fullBlocksPerGrid, blockSize>>>(n, dev_odata, dev_idata, dev_bool, dev_scanResult);
             timer().endGpuTimer();
-            return -1;
+
+            // The last boolean is not counted by the exclusive scan; if the last
+            // element is kept, it has to be added back in. (Not shown in the slides.)
+            cudaMemcpy(&count, dev_scanResult + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            cudaMemcpy(&lastBool, dev_bool + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+
+            int length = count + lastBool;
+            cudaMemcpy(odata, dev_odata, sizeof(int) * length, cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_bool);
+            cudaFree(dev_idata);
+            cudaFree(dev_odata);
+            cudaFree(dev_scanResult);
+
+            return length;
+        }
+
+        // Marks elements whose k-th bit is 0 with a 1 (the "false" flags used
+        // by the radix sort split).
+        __global__ void kernNegate(int n, int k, int* data, int* bools) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+
+            int curBit = (data[index] & (1 << k)) >> k;
+            bools[index] = curBit == 1 ? 0 : 1;
+        }
+
+        // Scatters each element to its position for the current bit: elements
+        // with bit k == 0 go to the front (in order), the rest follow them.
+        __global__ void kernSplit(int n, int k, int totalFalses, int* scan, int* idata, int* odata) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+
+            int cur = idata[index];
+            int curBit = (cur & (1 << k)) >> k;
+            int scanIdx = scan[index];
+            // odata[index] = curBit == 0 ? idata[scanIdx] : idata[index - scanIdx + totalFalses];
+            if (curBit == 0) {
+                odata[scanIdx] = cur;
+            } else {
+                odata[index - scanIdx + totalFalses] = cur;
+            }
+        }
+
+        void radixSort(int n, int* odata, const int* idata) {
+            int* dev_idata;
+            int* dev_odata;
+            int* dev_scanResult;
+            int* dev_bool;
+
+            int size = 1 << ilog2ceil(n);
+            int lastBool = 0;
+            int lastCount = 0;
+            int totalFalses = 0;
+
+            dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            cudaMalloc((void**)&dev_odata, n * sizeof(int));
+            cudaMalloc((void**)&dev_bool, n * sizeof(int));
+            cudaMalloc((void**)&dev_scanResult, size * sizeof(int));
+
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+            timer().startGpuTimer();
+
+            int max = *std::max_element(idata, idata + n);
+            // number of bits needed to represent the largest value
+            // (max + 1 so that exact powers of two keep their top bit)
+            int numBits = ilog2ceil(max + 1);
+
+            for (int i = 0; i < numBits; i++) {
+                kernNegate<<<fullBlocksPerGrid, blockSize>>>(n, i, dev_idata, dev_bool);
+                cudaMemcpy(&lastBool, dev_bool + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+                cudaMemcpy(dev_scanResult, dev_bool, n * sizeof(int), cudaMemcpyDeviceToDevice);
+                prefixSumScan(size, dev_scanResult);
+                cudaMemcpy(&lastCount, dev_scanResult + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+                totalFalses = lastBool + lastCount;
+                kernSplit<<<fullBlocksPerGrid, blockSize>>>(n, i, totalFalses, dev_scanResult, dev_idata, dev_odata);
+                std::swap(dev_idata, dev_odata);
+            }
+            timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_idata);
+            cudaFree(dev_odata);
+            cudaFree(dev_bool);
+            cudaFree(dev_scanResult);
         }
     }
 }
diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h
index 803cb4f..8efb4f4 100644
--- a/stream_compaction/efficient.h
+++ b/stream_compaction/efficient.h
@@ -9,5 +9,7 @@ namespace StreamCompaction {
         void scan(int n, int *odata, const int *idata);
 
         int compact(int n, int *odata, const int *idata);
+
+        void radixSort(int n, int* odata, const int* idata);
     }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..77f5a02 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -3,6 +3,10 @@
 #include "common.h"
 #include "naive.h"
 
+
+#define blockSize 128
+
 namespace StreamCompaction {
     namespace Naive {
         using StreamCompaction::Common::PerformanceTimer;
@@ -12,14 +16,48 @@ namespace StreamCompaction {
             return timer;
         }
         // TODO: __global__
-
+        __global__ void kernScan(int n, int d, int* odata, const int* idata) {
+            int k = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (k >= n) {
+                return;
+            }
+            int stride = 1 << (d - 1);
+            if (k >= stride) {
+                odata[k] = idata[k - stride] + idata[k];
+            }
+            else {
+                odata[k] = idata[k];
+            }
+        }
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            int* dev_idata;
+            int* dev_odata;
+
+            dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            cudaMalloc((void**)&dev_odata, n * sizeof(int));
+
+            cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+            // cudaMemcpy(dev_odata, odata, sizeof(int) * n, cudaMemcpyHostToDevice);
             timer().startGpuTimer();
             // TODO
+            for (int d = 1; d <= ilog2ceil(n); d++) {
+                kernScan<<<fullBlocksPerGrid, blockSize>>>(n, d, dev_odata, dev_idata);
+                std::swap(dev_odata, dev_idata);
+            }
             timer().endGpuTimer();
+            // convert the inclusive scan result into an exclusive scan
+            odata[0] = 0;
+            cudaMemcpy(odata + 1, dev_idata, sizeof(int) * (n - 1), cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_idata);
+            cudaFree(dev_odata);
         }
     }
 }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..ed16d1e 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -18,11 +18,17 @@ namespace StreamCompaction {
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            thrust::host_vector<int> host_data(idata, idata + n);
+            thrust::device_vector<int> dev_in(host_data);
+            thrust::device_vector<int> dev_out(host_data);
             timer().startGpuTimer();
             // TODO use `thrust::exclusive_scan`
             // example: for device_vectors dv_in and dv_out:
             // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            thrust::exclusive_scan(dev_in.begin(), dev_in.end(), dev_out.begin());
             timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_out.data().get(), sizeof(int) * n, cudaMemcpyDeviceToHost);
         }
     }
 }