diff --git a/README.md b/README.md
index 0e38ddb..ff3ce43 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,296 @@
-CUDA Stream Compaction
+University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2 - CUDA Stream Compaction
 ======================
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
+* Di Lu
 * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Tested on: Windows 11, i7-12700H @ 2.30GHz 32GB, NVIDIA GeForce RTX 3050 Ti
+
+## Introduction
+
+In this project, I implemented the following algorithms on the GPU and tested them:
+
+1. Exclusive Scan (CPU, Naive Scan, Work-Efficient Scan, Thrust Scan) - given an array A, output another array B such that each element b\[i\]
+is the sum a\[0\] + ... + a\[i - 1\], excluding a\[i\] itself.
+2. Stream Compaction - given an array A, output another array B that contains only the elements of A which meet a criterion.
+
+## Implementation and Results
+#### CPU: Sequential Scan
+
+* On the CPU, a sequential scan is a simple for-loop over the input array A that
+writes B[i] = B[i - 1] + A[i - 1], with B[0] = 0.
+
+#### CPU: Stream Compact without Scan
+
+* Stream compaction without scan simply keeps a counter of the current write index into the output array B
+while iterating through input array A. If A[i] != 0, then B[output_index] = A[i] and the counter advances.
+
+#### CPU: Stream Compact with Scan
+
+* Stream compaction with scan is a sequential implementation of the GPU stream compaction algorithm, except on the CPU,
+so none of the advantages of parallel execution come into play. See the GPU stream
+compaction section below for the algorithm itself.
+
+#### GPU: Naive GPU Scan
+
+* A naive GPU scan takes an input array (or read array) and, for each element, adds in the element one offset
+to its left, writing into an output array (or write array). It then ping-pongs the arrays (read becomes write and
+vice versa), doubles the additive offset, and repeats until the offset reaches the array length, at which point
+every element holds its inclusive prefix sum. Each addition within a level can be done in parallel, since no two
+threads ever write to the same array slot. The only caveat is that each level must finish its reads and writes
+before the next level begins.
+
+#### GPU: Work-Efficient GPU Scan
+
+* A work-efficient exclusive GPU scan uses only one array and does the operation in place. The array is treated as a balanced
+binary tree in which the left child of each node retains its original value from the read array. Keeping those
+original values around lets us recover the entire scan array during the down sweep.
+
+1. Up Sweep:
+For each consecutive pair of elements, add them and store the sum in the right-hand partner's slot, overwriting it.
+Repeat with a doubled stride until only two partners remain, whose single resulting sum is written to the last array slot.
+The array now alternates between "original" and "partial sum" values.
+
+2. Down Sweep:
+Set the last element of the array to 0; this is the root node. Walking back down the tree, each node passes its own
+value to its left child and writes the sum of its value and the left child's old value (i.e., the sibling's value)
+into the right child. A worked example follows the Thrust section below.
+
+#### GPU: Thrust Scan
+
+* This scan is fairly simple to implement, as `thrust::exclusive_scan` is a built-in function of the Thrust library.
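+
+To make the up sweep and down sweep concrete, here is a worked trace on a small
+hypothetical 8-element input (the example used in GPU Gems 3), producing an
+exclusive scan:
+
+```
+input:        [3 1 7 0 4 1 6 3]
+up sweep:     [3 4 7 7 4 5 6 9]        offset 2: each right partner takes the pair sum
+              [3 4 7 11 4 5 6 14]      offset 4
+              [3 4 7 11 4 5 6 25]      offset 8: the last slot now holds the total
+zero root:    [3 4 7 11 4 5 6 0]
+down sweep:   [3 4 7 0 4 5 6 11]       offset 8: left child gets the node value, right child gets node + old left
+              [3 0 7 4 4 11 6 16]      offset 4
+              [0 3 4 11 11 15 16 22]   offset 2: the exclusive scan of the input
+```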
+
+#### GPU: Stream Compaction
+
+* On the GPU, stream compaction consists of the following steps (a condensed kernel
+sketch appears at the end of this README):
+
+1. For each element in A, compute a "boolean" array b such that: b[i] = 1 if A[i] meets the criterion, and b[i] = 0 otherwise.
+2. Run exclusive scan on the boolean array b to produce an index array s.
+3. Scatter: if b[i] = 1, write A[i] to index s[i] of the output array.
+
+## Output Example
+
+The following is an example of the output for N = 2^20:
+```
+****************
+** SCAN TESTS **
+****************
+    [ 18 19 38 39 17 17 11 34 0 18 47 37 37 ... 2 0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 2.1343ms (std::chrono Measured)
+    [ 0 18 37 75 114 131 148 159 193 193 211 258 295 ... 25679509 25679511 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 2.0678ms (std::chrono Measured)
+    [ 0 18 37 75 114 131 148 159 193 193 211 258 295 ... 25679459 25679462 ]
+    passed
+==== naive scan, power-of-two ====
+blockSize: 256
+   elapsed time: 1.40179ms (CUDA Measured)
+    [ 0 18 37 75 114 131 148 159 193 193 211 258 295 ... 25679509 25679511 ]
+    passed
+==== naive scan, non-power-of-two ====
+blockSize: 256
+   elapsed time: 1.40688ms (CUDA Measured)
+    [ 0 18 37 75 114 131 148 159 193 193 211 258 295 ... 0 0 ]
+    passed
+==== work-efficient scan, power-of-two ====
+blockSize: 256
+   elapsed time: 1.63549ms (CUDA Measured)
+    [ 0 18 37 75 114 131 148 159 193 193 211 258 295 ... 25679509 25679511 ]
+    passed
+==== work-efficient scan, non-power-of-two ====
+blockSize: 256
+   elapsed time: 1.6831ms (CUDA Measured)
+    [ 0 18 37 75 114 131 148 159 193 193 211 258 295 ... 25679459 25679462 ]
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 0.404608ms (CUDA Measured)
+    [ 0 18 37 75 114 131 148 159 193 193 211 258 295 ... 25679509 25679511 ]
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 0.3792ms (CUDA Measured)
+    [ 0 18 37 75 114 131 148 159 193 193 211 258 295 ... 25679459 25679462 ]
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [ 0 1 2 3 1 1 1 2 0 0 3 3 3 ... 0 0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 2.3575ms (std::chrono Measured)
+    [ 1 2 3 1 1 1 2 3 3 3 1 2 2 ... 1 1 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 2.4421ms (std::chrono Measured)
+    [ 1 2 3 1 1 1 2 3 3 3 1 2 2 ... 1 1 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 6.5362ms (std::chrono Measured)
+    [ 1 2 3 1 1 1 2 3 3 3 1 2 2 ... 1 1 ]
+    passed
+==== work-efficient compact, power-of-two ====
+blockSize: 256
+   elapsed time: 3.06957ms (CUDA Measured)
+    [ 1 2 3 1 1 1 2 3 3 3 1 2 2 ... 1 1 ]
+    passed
+==== work-efficient compact, non-power-of-two ====
+blockSize: 256
+   elapsed time: 2.88675ms (CUDA Measured)
+    [ 1 2 3 1 1 1 2 3 3 3 1 2 2 ... 1 1 ]
+    passed
+```
+
+## Testing Methods
+
+To test my program, each run generates a random array of numbers within a given
+range. I varied the array length from 4 up to 2^25, and tested arrays both with
+and without a 0 as the final element.
+
+## Performance Analysis
+
+* Roughly optimize the block sizes of each of your implementations for minimal
+  run time on your GPU.
+  * In the following performance analysis, I used an array size of N = 2^25.
+
+  ![](img/blocksizeTable.png)
+
+  _Table. 1 Blocksize vs. Results for each algorithm_
+
+  ![](img/blocksize.png)
+
+  _Fig. 1 Blocksize vs. Results for each algorithm_
+
+* Compare all of these GPU Scan implementations (Naive, Work-Efficient, and
+  Thrust) to the serial CPU version of Scan. Plot a graph of the comparison
+  (with array size on the independent axis).
+  #### Scan
+  ![](img/scanTable.png)
+
+  _Table. 2 Scan Algorithm: Number of Elements vs. Results_
+
+  ![](img/scanpow2.png)
+
+  _Fig. 2 Scan Algorithm: Number of Elements vs. Results, Power of 2_
+
+  ![](img/scannotpow2.png)
+
+  _Fig. 3 Scan Algorithm: Number of Elements vs. Results, NON Power of 2_
+
+  In Scan, the performance results for power-of-2 and non-power-of-2 arrays are fairly similar.
+  As expected, CPU scan is the least efficient algorithm once there are many elements. Naive scan is slightly
+  more efficient, although not by much. The work-efficient GPU scan does run faster than the CPU and naive versions,
+  but the effect only shows once the input array grows beyond roughly 1.5 million elements. Thrust scan is the
+  fastest of them all.
+
+* Write a brief explanation of the phenomena you see here.
+  * Can you find the performance bottlenecks? Is it memory I/O? Computation? Is
+    it different for each implementation?
+
+I am not surprised that the work-efficient GPU implementation isn't faster until the input array gets
+significantly large. This is because I didn't optimize my implementations to ensure that threads are not idling:
+during scan I only use every other thread, then every fourth thread, and so on. A way around this is to
+launch fewer threads per level and compute each thread's destination slot from its thread index and the current
+up-sweep or down-sweep depth.
+
+Additionally, I ensured that my GPU timers included as few cudaMalloc, cudaMemcpy, and cudaMemset calls as
+possible, but it's not always feasible to move all of them outside of the "main" algorithm code. For example,
+compaction needs a cudaMalloc for an array whose length is only known after a value is computed mid-algorithm,
+so that allocation has to stay inside the timed region.
+
+  * To guess at what might be happening inside the Thrust implementation (e.g.
+    allocation, memory copy), take a look at the Nsight timeline for its
+    execution. Your analysis here doesn't have to be detailed, since you aren't
+    even looking at the code for the implementation.
+
+  #### Stream Compaction
+  ![](img/streamTable.png)
+
+  _Table. 3 Stream Compaction Algorithm: Number of Elements vs. Results_
+
+  ![](img/streampow2.png)
+
+  _Fig. 4 Stream Compaction Algorithm: Number of Elements vs. Results, Power of 2_
+
+  ![](img/streamnotpow2.png)
+
+  _Fig. 5 Stream Compaction Algorithm: Number of Elements vs. Results, NON Power of 2_
+
+* Paste the output of the test program into a triple-backtick block in your
+  README.
+  * See above
+
+## NSight Performance Confusion
+
+When running my program in Nsight, the reported timings for N = 2^25 are much faster than in
+Release mode in Visual Studio. I'm not exactly sure why.
+
+```
+****************
+** SCAN TESTS **
+****************
+    [ 17 20 19 7 22 46 11 3 35 15 31 39 34 ... 20 0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 2.0798ms (std::chrono Measured)
+    [ 0 17 37 56 63 85 131 142 145 180 195 226 265 ... 25666495 25666515 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 2.0982ms (std::chrono Measured)
+    [ 0 17 37 56 63 85 131 142 145 180 195 226 265 ... 25666419 25666467 ]
+    passed
+==== naive scan, power-of-two ====
+blockSize: 256
+   elapsed time: 1.716ms (CUDA Measured)
+    [ 0 17 37 56 63 85 131 142 145 180 195 226 265 ... 25666495 25666515 ]
+    passed
+==== naive scan, non-power-of-two ====
+blockSize: 256
+   elapsed time: 2.00374ms (CUDA Measured)
+    [ 0 17 37 56 63 85 131 142 145 180 195 226 265 ... 0 0 ]
+    passed
+==== work-efficient scan, power-of-two ====
+blockSize: 256
+   elapsed time: 2.3617ms (CUDA Measured)
+    [ 0 17 37 56 63 85 131 142 145 180 195 226 265 ... 25666495 25666515 ]
+    passed
+==== work-efficient scan, non-power-of-two ====
+blockSize: 256
+   elapsed time: 2.23104ms (CUDA Measured)
+    [ 0 17 37 56 63 85 131 142 145 180 195 226 265 ... 25666419 25666467 ]
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 0.567744ms (CUDA Measured)
+    [ 0 17 37 56 63 85 131 142 145 180 195 226 265 ... 25666495 25666515 ]
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 0.566816ms (CUDA Measured)
+    [ 0 17 37 56 63 85 131 142 145 180 195 226 265 ... 25666419 25666467 ]
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [ 3 2 3 1 2 0 1 3 3 3 1 1 0 ... 0 0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 2.3101ms (std::chrono Measured)
+    [ 3 2 3 1 2 1 3 3 3 1 1 3 1 ... 2 2 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 2.2745ms (std::chrono Measured)
+    [ 3 2 3 1 2 1 3 3 3 1 1 3 1 ... 2 2 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 6.392ms (std::chrono Measured)
+    [ 3 2 3 1 2 1 3 3 3 1 1 3 1 ... 2 2 ]
+    passed
+==== work-efficient compact, power-of-two ====
+blockSize: 256
+   elapsed time: 4.70064ms (CUDA Measured)
+    [ 3 2 3 1 2 1 3 3 3 1 1 3 1 ... 2 2 ]
+    passed
+==== work-efficient compact, non-power-of-two ====
+blockSize: 256
+   elapsed time: 3.8247ms (CUDA Measured)
+```
+
-### (TODO: Your README)
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
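+
+## Appendix: Compaction Kernel Sketch
+
+As referenced in the GPU stream compaction section above, here is a condensed,
+hypothetical sketch of the three compaction steps (any exclusive scan can supply
+step 2; the real implementations live in `stream_compaction/efficient.cu`):
+
+```
+// step 1: map each element to 1 (keep) or 0 (discard)
+__global__ void kernMapToBoolean(int n, int* bools, const int* idata) {
+    int i = threadIdx.x + blockIdx.x * blockDim.x;
+    if (i < n) bools[i] = (idata[i] != 0) ? 1 : 0;
+}
+
+// step 3: scatter the kept elements to the indices produced by the scan of bools
+__global__ void kernScatter(int n, int* odata, const int* idata,
+                            const int* bools, const int* indices) {
+    int i = threadIdx.x + blockIdx.x * blockDim.x;
+    if (i < n && bools[i] == 1) odata[indices[i]] = idata[i];
+}
+
+// host side: map -> exclusive scan (bools -> indices) -> scatter;
+// the compacted length is indices[n - 1] + bools[n - 1]
+```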
diff --git a/img/blocksize.png b/img/blocksize.png
new file mode 100644
index 0000000..db7cf40
Binary files /dev/null and b/img/blocksize.png differ
diff --git a/img/blocksizeTable.png b/img/blocksizeTable.png
new file mode 100644
index 0000000..951f8f8
Binary files /dev/null and b/img/blocksizeTable.png differ
diff --git a/img/elems.png b/img/elems.png
new file mode 100644
index 0000000..d5ac6b0
Binary files /dev/null and b/img/elems.png differ
diff --git a/img/scan.png b/img/scan.png
new file mode 100644
index 0000000..a19a4b3
Binary files /dev/null and b/img/scan.png differ
diff --git a/img/scanTable.png b/img/scanTable.png
new file mode 100644
index 0000000..6616e02
Binary files /dev/null and b/img/scanTable.png differ
diff --git a/img/scannotpow2.png b/img/scannotpow2.png
new file mode 100644
index 0000000..253ebba
Binary files /dev/null and b/img/scannotpow2.png differ
diff --git a/img/scanpow2.png b/img/scanpow2.png
new file mode 100644
index 0000000..817de79
Binary files /dev/null and b/img/scanpow2.png differ
diff --git a/img/stream.png b/img/stream.png
new file mode 100644
index 0000000..9699f10
Binary files /dev/null and b/img/stream.png differ
diff --git a/img/streamTable.png b/img/streamTable.png
new file mode 100644
index 0000000..cad7356
Binary files /dev/null and b/img/streamTable.png differ
diff --git a/img/streamnotpow2.png b/img/streamnotpow2.png
new file mode 100644
index 0000000..cf1d8e0
Binary files /dev/null and b/img/streamnotpow2.png differ
diff --git a/img/streampow2.png b/img/streampow2.png
new file mode 100644
index 0000000..3ea4e41
Binary files /dev/null and b/img/streampow2.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 896ac2b..67d5d7b 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,7 +13,9 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+// const int SIZE = 1 << 8; // feel free to change the size of array
+// todo bring back
+const int SIZE = 1 << 20;
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
@@ -28,6 +30,7 @@ int main(int argc, char* argv[]) {
     printf("****************\n");
 
     genArray(SIZE - 1, a, 50);  // Leave a 0 at the end to test that edge case
+    // genArray(SIZE, a, 50);  // for cases without 0 at the end
     a[SIZE - 1] = 0;
     printArray(SIZE, a, true);
 
@@ -51,7 +54,7 @@ int main(int argc, char* argv[]) {
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
@@ -64,35 +67,35 @@ int main(int argc, char* argv[]) {
     printDesc("naive scan, non-power-of-two");
     StreamCompaction::Naive::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, power-of-two");
     StreamCompaction::Efficient::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, non-power-of-two");
     StreamCompaction::Efficient::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
+    printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
     printDesc("thrust scan, power-of-two");
     StreamCompaction::Thrust::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("thrust scan, non-power-of-two");
     StreamCompaction::Thrust::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
+    printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
     printf("\n");
@@ -103,6 +106,7 @@ int main(int argc, char* argv[]) {
 
     // Compaction tests
     genArray(SIZE - 1, a, 4);  // Leave a 0 at the end to test that edge case
+    //genArray(SIZE, a, 4); // Case without 0 at the end;
     a[SIZE - 1] = 0;
     printArray(SIZE, a, true);
 
@@ -137,18 +141,68 @@ int main(int argc, char* argv[]) {
     printDesc("work-efficient compact, power-of-two");
     count = StreamCompaction::Efficient::compact(SIZE, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
+    printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient compact, non-power-of-two");
     count = StreamCompaction::Efficient::compact(NPOT, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
+    printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
 
+    printf("\n");
+    printf("**********************\n");
+    printf("** RADIX SORT TESTS **\n");
+    printf("**********************\n");
+
+    // Radix Sort tests
+
+    genArray(SIZE - 1, a, 8);  // Leave a 0 at the end to test that edge case
+    //genArray(SIZE, a, 4); // Case without 0 at the end;
+    a[SIZE - 1] = 0;
+    printArray(SIZE, a, true);
+
+    int* d = new int[SIZE];
+    int* e = new int[SIZE];
+
+    zeroArray(SIZE, b);
+    printDesc("control case thrust sort, power-of-two");
+    StreamCompaction::Thrust::sort(SIZE, b, a);
+    printArray(SIZE, b, true);
+
+    zeroArray(SIZE, c);
+    printDesc("control case thrust sort, non-power-of-two");
+    StreamCompaction::Thrust::sort(NPOT, c, a);
+    printArray(NPOT, c, true);
+    // do not compare to array b.
+
+    zeroArray(SIZE, d);
+    printDesc("radix sort, power-of-two");
+    RadixSort::radixSort(SIZE, d, a);
+    printArray(SIZE, d, true);
+    printCmpResult(SIZE, d, b);
+
+    /* zeroArray(SIZE, b);
+    printDesc("radix sort, power-of-two");
+    count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
+    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+    expectedCount = count;
+    printArray(count, b, true);
+    printCmpLenResult(count, expectedCount, b, b);
+
+    zeroArray(SIZE, b);
+    printDesc("radix sort, non power-of-two");
+    count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
+    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+    expectedCount = count;
+    printArray(count, b, true);
+    printCmpLenResult(count, expectedCount, b, b);*/
+
     system("pause"); // stop Win32 console from closing on exit
     delete[] a;
     delete[] b;
     delete[] c;
+    delete[] d;
+    delete[] e;
 }
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa11..21c6943 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -13,26 +13,43 @@ namespace StreamCompaction {
         }
 
         /**
-         * CPU scan (prefix sum).
+         * CPU scan (prefix sum).
          * For performance analysis, this is supposed to be a simple for loop.
         * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
         */
        void scan(int n, int *odata, const int *idata) {
            timer().startCpuTimer();
-            // TODO
+            // perform an exclusive prefix sum
+            for (int i = 0; i < n; i++) {
+                if (i == 0) {
+                    odata[i] = 0;
+                }
+                else {
+                    odata[i] = odata[i - 1] + idata[i - 1];
+                }
+            }
            timer().endCpuTimer();
        }
 
        /**
         * CPU stream compaction without using the scan function.
-         *
+         *
         * @returns the number of elements remaining after compaction.
         */
        int compactWithoutScan(int n, int *odata, const int *idata) {
            timer().startCpuTimer();
-            // TODO
+            // remove elements that are 0
+            int oi = 0;
+            for (int i = 0; i < n; i++) {
+                if (idata[i] != 0) {
+                    odata[oi] = idata[i];
+                    oi++;
+                }
+            }
            timer().endCpuTimer();
-            return -1;
+
+            // return the number of valid elements
+            return oi;
        }
 
        /**
@@ -43,8 +60,46 @@ namespace StreamCompaction {
        int compactWithScan(int n, int *odata, const int *idata) {
            timer().startCpuTimer();
            // TODO
+            int* tdata = new int[n];
+            int* sdata = new int[n];
+
+            // map the input to a 0/1 "keep" array
+            for (int i = 0; i < n; i++) {
+                if (idata[i] != 0) {
+                    tdata[i] = 1;
+                }
+                else {
+                    // new int[n] is not zero-initialized, so write the 0 explicitly
+                    tdata[i] = 0;
+                }
+            }
+
+            // perform an exclusive prefix sum on tdata to produce sdata
+            for (int i = 0; i < n; i++) {
+                if (i == 0) {
+                    sdata[i] = 0;
+                }
+                else {
+                    sdata[i] = sdata[i - 1] + tdata[i - 1];
+                }
+            }
+
+            // scatter: iterate over tdata; if tdata[i] is 1, write idata[i] to odata[sdata[i]],
+            // where the compacted size is the last element of sdata plus the last element of tdata
+            for (int i = 0; i < n; i++) {
+                if (tdata[i] == 1) {
+                    odata[sdata[i]] = idata[i];
+                }
+            }
+
+            int numValid = sdata[n - 1] + tdata[n - 1];
+
+            delete[] tdata;
+            delete[] sdata;
+
            timer().endCpuTimer();
-            return -1;
+            return numValid;
        }
    }
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..613f7f4 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -2,6 +2,9 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "efficient.h"
+#include <iostream>
+#include <cstring>
+
+#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__)
 
 namespace StreamCompaction {
    namespace Efficient {
@@ -12,13 +15,190 @@ namespace StreamCompaction {
            return timer;
        }
 
+        /*! Block size used for CUDA kernel launch. */
+        #define blockSizeEffScan 256
+        #define blockSizeCompact 256
+
+        // up-sweep (reduction) step: every partner pair at the given offset
+        // sums into the right-hand partner's slot
+        __global__ void kernReductionHelper(int n, int offset, int* tdata) {
+            int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (idx < n && (idx + 1) % offset == 0) {
+                int neighborLoc = offset / 2;
+                int a = tdata[idx - neighborLoc];
+                int b = tdata[idx];
+                tdata[idx] = a + b;
+            }
+        }
+
+        // down-sweep step: the left partner takes the node's value and the
+        // right partner takes the sum of both
+        __global__ void kernPartialSumHelper(int n, int offset, int* tdata) {
+            int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (idx < n && (idx + 1) % offset == 0) {
+                int neighborLoc = offset / 2;
+                int a = tdata[idx - neighborLoc];
+                int b = tdata[idx];
+
+                tdata[idx - neighborLoc] = b;
+                tdata[idx] = a + b;
+            }
+        }
+
+        // shift all numbers one slot to the left
+        __global__ void kernShiftLeft(int n, int* odata, int* idata) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index < n) {
+                if (index > 0) {
+                    odata[index - 1] = idata[index];
+                }
+            }
+        }
+
        /**
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
        void scan(int n, int *odata, const int *idata) {
+            int fullBlocksPerArray = (n + blockSizeEffScan - 1) / blockSizeEffScan;
+
+            // for performance analysis
+            printf("blockSize: %i \n", blockSizeEffScan);
+
+            // pad the array to the next power of two so it forms a full binary tree
+            int nextPowTwo = ilog2ceil(n);
+            int numZeroes = (1 << nextPowTwo) - n;
+
+            // 1. up sweep, same as a reduction
+            // malloc enough space for the n elements plus the padding 0's
+            int* dev_tidata;
+            cudaMalloc((void**)&dev_tidata, (n + numZeroes) * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc dev_tidata failed!");
+
+            // set all elems to 0
+            cudaMemset(dev_tidata, 0, (n + numZeroes) * sizeof(int));
+            checkCUDAErrorWithLine("cudaMemset all elems to 0 failed!");
+
+            // copy the contents of idata into tidata so we can pass in tidata and modify it
+            // in place on every pass; anything after the first n elements stays 0
+            cudaMemcpy(dev_tidata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAErrorWithLine("cudaMemcpy idata to dev_tidata failed!");
+
+            // start gpu timer
            timer().startGpuTimer();
-            // TODO
+
+            for (int depth = 1; depth <= ilog2ceil(n + numZeroes); depth++) {
+                int offset = 1 << depth;
+                kernReductionHelper<<<fullBlocksPerArray, blockSizeEffScan>>>(n + numZeroes, offset, dev_tidata);
+                // wait for all threads to finish before the next level reads
+                cudaDeviceSynchronize();
+            }
+
+            // set the last int of the array to 0; this is the root of the down sweep
+            cudaMemset(&dev_tidata[n + numZeroes - 1], 0, sizeof(int));
+            checkCUDAErrorWithLine("cudaMemset last int failed!");
+
+            // 2. down sweep, from the widest offset back down to pairs
+            for (int depth = ilog2ceil(n + numZeroes); depth >= 1; depth--) {
+                int offset = 1 << depth;
+                kernPartialSumHelper<<<fullBlocksPerArray, blockSizeEffScan>>>(n + numZeroes, offset, dev_tidata);
+                cudaDeviceSynchronize();
+            }
+
+            // stop gpu timer
            timer().endGpuTimer();
+
+            // copy the final result to odata
+            cudaMemcpy(odata, dev_tidata, n * sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAErrorWithLine("cudaMemcpy dev_tidata to odata failed!");
+
+            // free all buffers
+            cudaFree(dev_tidata);
+        }
+
+        __global__ void kernMapToBoolean(int n, int* odata, int* idata) {
+            int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (idx < n) {
+                if (idata[idx] != 0) {
+                    // add 1 to the out array
+                    odata[idx] = 1;
+                }
+                else {
+                    // add 0 to the out array
+                    odata[idx] = 0;
+                }
+            }
+        }
+
+        __global__ void kernScatter(int n, int* odata, int* idata, int* tdata, int* sdata) {
+            int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (idx < n) {
+                if (tdata[idx] == 1) {
+                    int destinationIdx = sdata[idx];
+                    odata[destinationIdx] = idata[idx];
+                }
+                // otherwise do not write
+            }
+        }
+
+        // re-implementation of scan for compact: the same up sweep / down sweep,
+        // but the input and output are already device pointers
+        void compactScan(int n, int* dev_odata, int* dev_idata) {
+            int fullBlocksPerArray = (n + blockSizeCompact - 1) / blockSizeCompact;
+
+            // pad the array to the next power of two
+            int nextPowTwo = ilog2ceil(n);
+            int numZeroes = (1 << nextPowTwo) - n;
+
+            // 1. up sweep, same as a reduction
+            int* dev_tidata;
+            cudaMalloc((void**)&dev_tidata, (n + numZeroes) * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc dev_tidata failed!");
+
+            // set all elems to 0
+            cudaMemset(dev_tidata, 0, (n + numZeroes) * sizeof(int));
+            checkCUDAErrorWithLine("cudaMemset all elems to 0 failed!");
+
+            // copy dev_idata into dev_tidata; both pointers live on the GPU,
+            // so this must be a device-to-device copy
+            cudaMemcpy(dev_tidata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToDevice);
+            checkCUDAErrorWithLine("cudaMemcpy dev_idata to dev_tidata failed!");
+
+            for (int depth = 1; depth <= ilog2ceil(n + numZeroes); depth++) {
+                int offset = 1 << depth;
+                kernReductionHelper<<<fullBlocksPerArray, blockSizeCompact>>>(n + numZeroes, offset, dev_tidata);
+                cudaDeviceSynchronize();
+            }
+
+            // set the last int of the array to 0
+            cudaMemset(&dev_tidata[n + numZeroes - 1], 0, sizeof(int));
+            checkCUDAErrorWithLine("cudaMemset last int failed!");
+
+            // 2. down sweep
+            for (int depth = ilog2ceil(n + numZeroes); depth >= 1; depth--) {
+                int offset = 1 << depth;
+                kernPartialSumHelper<<<fullBlocksPerArray, blockSizeCompact>>>(n + numZeroes, offset, dev_tidata);
+                cudaDeviceSynchronize();
+            }
+
+            // copy the final result to dev_odata (device to device)
+            cudaMemcpy(dev_odata, dev_tidata, n * sizeof(int), cudaMemcpyDeviceToDevice);
+            checkCUDAErrorWithLine("cudaMemcpy dev_tidata to dev_odata failed!");
+
+            // free all buffers
+            cudaFree(dev_tidata);
        }
 
        /**
@@ -31,10 +211,198 @@ namespace StreamCompaction {
         * @returns The number of elements remaining after compaction.
         */
        int compact(int n, int *odata, const int *idata) {
+            int fullBlocksPerArray = (n + blockSizeCompact - 1) / blockSizeCompact;
+
+            printf("blockSize: %i \n", blockSizeCompact);
+            // 1. compute the temp array: 1 for everything that fits the rule, 0 otherwise.
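+            //
+            // buffer flow through the three steps below (sketch):
+            //   idata (host) -> dev_iArray --kernMapToBoolean--> dev_tempArray
+            //   dev_tempArray --compactScan--> dev_scanArray
+            //   (dev_iArray, dev_tempArray, dev_scanArray) --kernScatter--> dev_scatterFinal -> odata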
+            int* dev_iArray;
+            int* dev_tempArray;
+
+            cudaMalloc((void**)&dev_tempArray, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc dev_tempArray failed!");
+
+            cudaMalloc((void**)&dev_iArray, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc dev_iArray failed!");
+
+            cudaMemcpy(dev_iArray, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAErrorWithLine("cudaMemcpy dev_iArray failed!");
+
+            int* dev_scanArray;
+            cudaMalloc((void**)&dev_scanArray, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc dev_scanArray failed!");
+
+            // start gpu timer
            timer().startGpuTimer();
-            // TODO
+
+            kernMapToBoolean<<<fullBlocksPerArray, blockSizeCompact>>>(n, dev_tempArray, dev_iArray);
+
+            // 2. exclusive scan on the temp array.
+            compactScan(n, dev_scanArray, dev_tempArray);
+
+            // 3. scatter
+            // the last element of the scan plus the last element of the boolean
+            // array gives the length of the compacted output.
+            int numScatters = 0;
+            int validSlot = 0;
+            cudaMemcpy(&numScatters, dev_scanArray + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            cudaMemcpy(&validSlot, dev_tempArray + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+
+            numScatters += validSlot;
+            checkCUDAErrorWithLine("cudaMemcpy numScatters failed!");
+
+            // unavoidable malloc: the output length is only known after the scan
+            int* dev_scatterFinal;
+            cudaMalloc((void**)&dev_scatterFinal, numScatters * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc dev_scatterFinal failed!");
+
+            kernScatter<<<fullBlocksPerArray, blockSizeCompact>>>(n, dev_scatterFinal, dev_iArray, dev_tempArray, dev_scanArray);
+
+            // end gpu timer
            timer().endGpuTimer();
-            return -1;
+
+            // memcpy the compacted result back to odata
+            cudaMemcpy(odata, dev_scatterFinal, numScatters * sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAErrorWithLine("cudaMemcpy dev_scatterFinal to odata failed!");
+
+            cudaFree(dev_iArray);
+            cudaFree(dev_tempArray);
+            cudaFree(dev_scanArray);
+            cudaFree(dev_scatterFinal);
+
+            return numScatters;
        }
    }
 }
+
+namespace RadixSort {
+    //// todo import performance timer
+
+    /*! Block size used for CUDA kernel launch. */
+    #define blockSize 128
+
+    // map each element to the value of its bit at bitpos (0 or 1)
+    __global__ void kernMapToBoolean(int n, int* odata, int* idata, int bitpos) {
+        int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+        if (idx < n) {
+            int num = idata[idx];
+            int bit = (num & (1 << bitpos)) >> bitpos;
+
+            if (bit != 0) {
+                odata[idx] = 1;
+            }
+            else {
+                odata[idx] = 0;
+            }
+        }
+    }
+
+    // scatter every element: keys with bit == 0 go to their f-address and
+    // keys with bit == 1 go to their t-address
+    __global__ void kernComputeScatter(int n, int bitpos, int* ddata, int* bdata, int* tdata, int* fdata, int* idata) {
+        int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+        if (idx < n) {
+            int dest = (bdata[idx] == 1) ? tdata[idx] : fdata[idx];
+            ddata[dest] = idata[idx];
+        }
+    }
+
+    __global__ void kernComputeT(int n, int* tdata, int* fdata, int totalFalses) {
+        int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+        if (idx < n) {
+            tdata[idx] = idx - fdata[idx] + totalFalses;
+        }
+    }
+
+    __global__ void kernComputeE(int n, int* odata, int* idata) {
+        int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+        if (idx < n) {
+            odata[idx] = !idata[idx];
+        }
+    }
+
+    void split(int n, int* odata, int* idata, int bitpos) {
+        // all of the following operations key off the single bit specified by bitpos
+        int fullBlocksPerArray = (n + blockSize - 1) / blockSize;
+
+        // 1. compute the b array (the bit at bitpos) and the e array (its negation)
+        int* dev_iarray;
+        cudaMalloc((void**)&dev_iarray, n * sizeof(int));
+        checkCUDAErrorWithLine("cudaMalloc dev_iarray failed!");
+        cudaMemcpy(dev_iarray, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+        checkCUDAErrorWithLine("cudaMemcpy dev_iarray failed!");
+
+        int* dev_barray;
+        cudaMalloc((void**)&dev_barray, n * sizeof(int));
+        checkCUDAErrorWithLine("cudaMalloc dev_barray failed!");
+
+        // call kernMapToBoolean to calculate b
+        kernMapToBoolean<<<fullBlocksPerArray, blockSize>>>(n, dev_barray, dev_iarray, bitpos);
+
+        int* dev_earray;
+        cudaMalloc((void**)&dev_earray, n * sizeof(int));
+        checkCUDAErrorWithLine("cudaMalloc dev_earray failed!");
+
+        kernComputeE<<<fullBlocksPerArray, blockSize>>>(n, dev_earray, dev_barray);
+
+        // 2. exclusive scan on e, stored in f
+        int* dev_farray;
+        cudaMalloc((void**)&dev_farray, n * sizeof(int));
+        checkCUDAErrorWithLine("cudaMalloc dev_farray failed!");
+
+        // use compactScan because it doesn't presume the input is a host pointer.
+        StreamCompaction::Efficient::compactScan(n, dev_farray, dev_earray);
+
+        // 3. compute the total number of falses
+        int fArrayLast = 0;
+        int eArrayLast = 0;
+        cudaMemcpy(&fArrayLast, dev_farray + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+        checkCUDAErrorWithLine("cudaMemcpy fArrayLast failed!");
+        cudaMemcpy(&eArrayLast, dev_earray + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+        checkCUDAErrorWithLine("cudaMemcpy eArrayLast failed!");
+
+        int totalFalses = fArrayLast + eArrayLast;
+
+        // 4. compute the t array, which contains the write addresses for the true keys.
+        int* dev_tarray;
+        cudaMalloc((void**)&dev_tarray, n * sizeof(int));
+        checkCUDAErrorWithLine("cudaMalloc dev_tarray failed!");
+
+        kernComputeT<<<fullBlocksPerArray, blockSize>>>(n, dev_tarray, dev_farray, totalFalses);
+
+        // 5. scatter based on the computed addresses.
+        int* dev_scatterFinal; // final output. send to odata
+        cudaMalloc((void**)&dev_scatterFinal, n * sizeof(int));
+        checkCUDAErrorWithLine("cudaMalloc dev_scatterFinal failed!");
+
+        kernComputeScatter<<<fullBlocksPerArray, blockSize>>>(n, bitpos, dev_scatterFinal, dev_barray, dev_tarray, dev_farray, dev_iarray);
+
+        // memcpy back to odata
+        cudaMemcpy(odata, dev_scatterFinal, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+        // free all device buffers
+        cudaFree(dev_iarray);
+        cudaFree(dev_barray);
+        cudaFree(dev_earray);
+        cudaFree(dev_farray);
+        cudaFree(dev_tarray);
+        cudaFree(dev_scatterFinal);
+    }
+
+    void radixSort(int n, int* odata, int* idata) {
+        // the test arrays only contain values in [0, 8), so 3 bits suffice
+        int numbits = 3;
+        // each split pass must consume the previous pass's output, so ping-pong
+        // through a scratch buffer instead of re-splitting the original input
+        int* scratch = new int[n];
+        std::memcpy(scratch, idata, n * sizeof(int));
+        for (int i = 0; i < numbits; i++) {
+            split(n, odata, scratch, i);
+            std::memcpy(scratch, odata, n * sizeof(int));
+        }
+        delete[] scratch;
+    }
+}
+
diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h
index 803cb4f..b408609 100644
--- a/stream_compaction/efficient.h
+++ b/stream_compaction/efficient.h
@@ -11,3 +11,11 @@ namespace StreamCompaction {
        int compact(int n, int *odata, const int *idata);
    }
 }
+
+namespace RadixSort {
+
+    void split(int n, int* odata, int* idata, int bitpos);
+
+    void radixSort(int n, int* odata, int* idata);
+}
+
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..c162fce 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -2,24 +2,114 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "naive.h"
+#include <iostream>
+
+#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__)
 
 namespace StreamCompaction {
    namespace Naive {
        using StreamCompaction::Common::PerformanceTimer;
+
+        /*! Block size used for CUDA kernel launch. */
+        #define blockSizeNaive 256
+
        PerformanceTimer& timer()
        {
            static PerformanceTimer timer;
            return timer;
        }
 
+        // one level of the naive scan: each element at index >= offset adds in
+        // the element offset slots to its left (reads idata, writes odata)
+        __global__ void kernScanHelper(int n, int offset, int *odata, int *idata) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index < n) {
+                if (index >= offset) {
+                    int a = idata[index - offset];
+                    int b = idata[index];
+                    odata[index] = a + b;
+                }
+                else {
+                    odata[index] = idata[index];
+                }
+            }
+        }
+
+        // shift everything right by one and write a leading 0, turning the
+        // inclusive scan into an exclusive one
+        __global__ void kernPrependZero(int n, int* odata, int* idata) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index < n) {
+                if (index == 0) {
+                    odata[index] = 0;
+                }
+                else {
+                    odata[index] = idata[index - 1];
+                }
+            }
+        }
 
        /**
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
        void scan(int n, int *odata, const int *idata) {
+            int fullBlocksPerArray = (n + blockSizeNaive - 1) / blockSizeNaive;
+
+            printf("blockSize: %i \n", blockSizeNaive);
+
+            // ping-pong buffers, plus a final buffer for the prepended 0
+            int* dev_odata1;
+            int* dev_odata2;
+            int* dev_odataFinal;
+
+            cudaMalloc((void**)&dev_odata1, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc dev_odata1 failed!");
+
+            cudaMalloc((void**)&dev_odata2, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc dev_odata2 failed!");
+
+            cudaMalloc((void**)&dev_odataFinal, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc dev_odataFinal failed!");
+
+            // set the values of odata2 to 0's; an exclusive scan never sets the 0th index
+            cudaMemset(dev_odata2, 0, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMemset dev_odata2 to 0's failed!");
+
+            // memcpy the contents of idata to odata1
+            cudaMemcpy(dev_odata1, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAErrorWithLine("cudaMemcpy idata to dev_odata1 Host to Device failed!");
+
+            // start gpu timer
            timer().startGpuTimer();
-            // TODO
+
+            // for each depth, invoke the scan kernel with offset 2^depth, then ping-pong the buffers
+            for (int depth = 0; depth < ilog2ceil(n); depth++) {
+                int offset = 1 << depth;
+
+                kernScanHelper<<<fullBlocksPerArray, blockSizeNaive>>>(n, offset, dev_odata2, dev_odata1);
+                // wait for all threads to finish this level
+                cudaDeviceSynchronize();
+
+                std::swap(dev_odata2, dev_odata1);
+            }
+
+            // shift right and add a 0 to the front; dev_odata1 always holds the most updated data
+            kernPrependZero<<<fullBlocksPerArray, blockSizeNaive>>>(n, dev_odataFinal, dev_odata1);
+
+            // end gpu timer
            timer().endGpuTimer();
+
+            // memcpy back from dev_odataFinal to odata
+            cudaMemcpy(odata, dev_odataFinal, n * sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAErrorWithLine("cudaMemcpy dev_odataFinal to odata failed!");
+
+            cudaFree(dev_odata1);
+            cudaFree(dev_odata2);
+            cudaFree(dev_odataFinal);
+        }
    }
 }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..6e3e241 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -3,9 +3,12 @@
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
 #include <thrust/scan.h>
+#include <thrust/sort.h>
 #include "common.h"
 #include "thrust.h"
 
+#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__)
+
 namespace StreamCompaction {
    namespace Thrust {
        using StreamCompaction::Common::PerformanceTimer;
@@ -18,10 +21,48 @@ namespace StreamCompaction {
        /**
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
        void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
            // TODO use `thrust::exclusive_scan`
            // example: for device_vectors dv_in and dv_out:
            // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            int* dev_idata;
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc dev_idata failed!");
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAErrorWithLine("cudaMemcpy dev_idata failed!");
+
+            int* dev_odata;
+            cudaMalloc((void**)&dev_odata, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc dev_odata failed!");
+
+            thrust::device_ptr<int> dev_thrust_in = thrust::device_ptr<int>(dev_idata);
+            thrust::device_ptr<int> dev_thrust_out = thrust::device_ptr<int>(dev_odata);
+
+            // only time the thrust scan itself, not the allocations or copies
+            timer().startGpuTimer();
+            thrust::exclusive_scan(dev_thrust_in, dev_thrust_in + n, dev_thrust_out);
+            timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAErrorWithLine("cudaMemcpy dev_odata failed!");
+
+            cudaFree(dev_idata);
+            cudaFree(dev_odata);
+        }
+
+        void sort(int n, int* odata, int* idata) {
+            int* dev_idata;
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc dev_idata failed!");
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAErrorWithLine("cudaMemcpy dev_idata failed!");
+
+            thrust::device_ptr<int> dev_thrust_in = thrust::device_ptr<int>(dev_idata);
+
+            // only time the sort itself, to match how scan is timed
+            timer().startGpuTimer();
+            thrust::sort(dev_thrust_in, dev_thrust_in + n);
+            timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost);
+            cudaFree(dev_idata);
+        }
    }
 }
diff --git a/stream_compaction/thrust.h b/stream_compaction/thrust.h
index fe98206..994d8f0 100644
--- a/stream_compaction/thrust.h
+++ b/stream_compaction/thrust.h
@@ -7,5 +7,6 @@ namespace StreamCompaction {
        StreamCompaction::Common::PerformanceTimer& timer();
 
        void scan(int n, int *odata, const int *idata);
+        void sort(int n, int* odata, int* idata);
    }
 }