From 0fd35de86cbee114f78404b8ea2d72ba031754a6 Mon Sep 17 00:00:00 2001
From: miaokeda <inverse_boom@outlook.com>
Date: Sun, 16 Apr 2023 00:24:04 +0800
Subject: [PATCH 1/2] updated all todos

---
 .clang-format                  | 213 +++++++++++++++++++++++++++++++++
 stream_compaction/common.cu    |   8 ++
 stream_compaction/cpu.cu       |  34 +++++-
 stream_compaction/efficient.cu | 141 +++++++++++++++++++++-
 stream_compaction/naive.cu     |  41 ++++++-
 stream_compaction/thrust.cu    |   4 +
 6 files changed, 437 insertions(+), 4 deletions(-)
 create mode 100644 .clang-format

diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..e295476
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,213 @@
+# Language: None, Cpp, Java, JavaScript, ObjC, Proto, TableGen, TextProto
+Language: Cpp
+# BasedOnStyle:	LLVM
+
+# Offset of access specifiers (public, private, etc.)
+AccessModifierOffset: -4
+
+# Alignment after an opening bracket (parenthesis, angle bracket, square bracket): Align, DontAlign, AlwaysBreak (always break after the opening bracket)
+AlignAfterOpenBracket: Align
+
+# Align the equals signs of consecutive assignments
+AlignConsecutiveAssignments: false
+
+# Align the variable names of consecutive declarations
+AlignConsecutiveDeclarations: false
+
+# Right-align the backslashes of escaped newlines (lines continued with a backslash)
+AlignEscapedNewlines: Right
+
+# Horizontally align the operands of binary and ternary expressions
+AlignOperands: true
+
+# Align consecutive trailing comments
+AlignTrailingComments: true
+
+# Do not allow all parameters of a function declaration to be placed on the next line
+AllowAllParametersOfDeclarationOnNextLine: false
+
+# Allow short blocks on a single line
+AllowShortBlocksOnASingleLine: true
+
+# Allow short case labels on a single line
+AllowShortCaseLabelsOnASingleLine: true
+
+# Allow short functions on a single line: None, InlineOnly (defined inside a class), Empty (empty functions), Inline (defined inside a class, or empty), All
+AllowShortFunctionsOnASingleLine: None
+
+# Allow short if statements to stay on a single line
+AllowShortIfStatementsOnASingleLine: true
+
+# Allow short loops to stay on a single line
+AllowShortLoopsOnASingleLine: true
+
+# Always break after the return type: None, All, TopLevel (top-level functions, excluding functions inside classes),
+# AllDefinitions (all definitions, excluding declarations), TopLevelDefinitions (definitions of all top-level functions)
+AlwaysBreakAfterReturnType: None
+
+# Always break before multi-line string literals
+AlwaysBreakBeforeMultilineStrings: false
+
+# Always break after a template declaration
+AlwaysBreakTemplateDeclarations: true
+
+# false means function call arguments are either all on one line or each on its own line
+BinPackArguments: true
+
+# false means function parameters are either all on one line or each on its own line
+BinPackParameters: true
+
+# Brace wrapping; only takes effect when BreakBeforeBraces is set to Custom
+BraceWrapping:
+  # After a class definition
+  AfterClass: false
+  # After a control statement
+  AfterControlStatement: false
+  # After an enum definition
+  AfterEnum: false
+  # After a function definition
+  AfterFunction: false
+  # After a namespace definition
+  AfterNamespace: false
+  # After a struct definition
+  AfterStruct: false
+  # After a union definition
+  AfterUnion: false
+  # After an extern block
+  AfterExternBlock: false
+  # Before catch
+  BeforeCatch: false
+  # Before else
+  BeforeElse: false
+  # Indent the braces themselves
+  IndentBraces: false
+  # Split empty functions across lines
+  SplitEmptyFunction: false
+  # Split empty records (class/struct/union) across lines
+  SplitEmptyRecord: false
+  # Split empty namespaces across lines
+  SplitEmptyNamespace: false
+
+# Break before binary operators: None (break after the operator), NonAssignment (break before non-assignment operators), All (break before the operator)
+BreakBeforeBinaryOperators: NonAssignment
+
+# Break before braces: Attach (always attach braces to the surrounding context), Linux (like Attach, except for function, namespace and class definitions),
+#   Mozilla (like Attach, except for enum, function and record definitions), Stroustrup (like Attach, except for function definitions, catch and else),
+#   Allman (always break before braces), GNU (always break before braces, with extra indentation for control-statement braces), WebKit (break before functions), Custom
+#   Note: statement blocks are treated as functions here
+BreakBeforeBraces: Custom
+
+# Break before ternary operators
+BreakBeforeTernaryOperators: false
+
+# Break after the colon of a constructor initializer list
+BreakConstructorInitializers: AfterColon
+
+#BreakInheritanceList: AfterColon
+
+BreakStringLiterals: false
+
+# Maximum number of characters per line; 0 means no limit
+ColumnLimit: 0
+
+CompactNamespaces: true
+
+# Constructor initializers are either all on one line or each on its own line
+ConstructorInitializerAllOnOneLineOrOnePerLine: false
+
+# Indent width of constructor initializer lists
+ConstructorInitializerIndentWidth: 4
+
+# Indent width of continuation lines
+ContinuationIndentWidth: 4
+
+# Remove the spaces after { and before } in C++11 braced-list initialization
+Cpp11BracedListStyle: true
+
+# Derive pointer and reference alignment from the most common usage in the file
+DerivePointerAlignment: false
+
+# Fix (add or correct) closing namespace comments
+FixNamespaceComments: true
+
+# Indent case labels
+IndentCaseLabels: false
+
+IndentPPDirectives: None
+
+# Indent width
+IndentWidth: 4
+
+# Indent the function name when a declaration or definition wraps after the return type
+IndentWrappedFunctionNames: false
+
+# Keep empty lines at the start of blocks
+KeepEmptyLinesAtTheStartOfBlocks: false
+
+# Maximum number of consecutive empty lines to keep
+MaxEmptyLinesToKeep: 1
+
+# Namespace indentation: None, Inner (indent the contents of nested namespaces), All
+NamespaceIndentation: None
+
+# Pointer and reference alignment: Left, Right, Middle
+PointerAlignment: Right
+
+# Allow comments to be re-flowed
+ReflowComments: true
+
+# Allow sorting of #include directives
+SortIncludes: false
+
+# Allow sorting of using declarations
+SortUsingDeclarations: false
+
+# Add a space after a C-style cast
+SpaceAfterCStyleCast: false
+
+# Add a space after the template keyword
+SpaceAfterTemplateKeyword: true
+
+# Add a space before assignment operators
+SpaceBeforeAssignmentOperators: true
+
+# SpaceBeforeCpp11BracedList: true
+
+# SpaceBeforeCtorInitializerColon: true
+
+# SpaceBeforeInheritanceColon: true
+
+# Add a space before opening parentheses: Never, ControlStatements, Always
+SpaceBeforeParens: ControlStatements
+
+# SpaceBeforeRangeBasedForLoopColon: true
+
+# Add spaces inside empty parentheses
+SpaceInEmptyParentheses: false
+
+# Number of spaces before trailing comments (applies to // comments only)
+SpacesBeforeTrailingComments: 1
+
+# Add spaces after < and before > in angle brackets
+SpacesInAngles: false
+
+# Add spaces inside the parentheses of C-style casts
+SpacesInCStyleCastParentheses: false
+
+# Add spaces inside container literals (e.g. ObjC and JavaScript arrays and dictionaries)
+SpacesInContainerLiterals: true
+
+# Add spaces after ( and before ) in parentheses
+SpacesInParentheses: false
+
+# Add spaces after [ and before ] in square brackets; lambda expressions and arrays of unspecified size are not affected
+SpacesInSquareBrackets: false
+
+# Standard: Cpp03, Cpp11, Auto
+Standard: Cpp11
+
+# Tab width
+TabWidth: 4
+
+# Use tab characters: Never, ForIndentation, ForContinuationAndIndentation, Always
+UseTab: Never
\ No newline at end of file
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 2ed6d63..6aec5d3 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -24,6 +24,9 @@ namespace StreamCompaction {
          */
         __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
             // TODO
+            const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+            // In-range threads flag their element: 1 when it is non-zero, 0 otherwise.
+            if (tid < n) bools[tid] = (idata[tid] != 0) ? 1 : 0;
         }
 
         /**
@@ -33,6 +36,11 @@ namespace StreamCompaction {
         __global__ void kernScatter(int n, int *odata,
                 const int *idata, const int *bools, const int *indices) {
             // TODO
+            const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+            if (tid >= n || bools[tid] == 0) {
+                return; // out of range, or element not flagged as kept
+            }
+            odata[indices[tid]] = idata[tid]; // copy the kept element to its scanned slot
         }
 
     }
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa11..7d6bb41 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -20,6 +20,10 @@ namespace StreamCompaction {
         void scan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+            // Exclusive prefix sum (odata[i] = sum of idata[0..i-1]); writes nothing when n <= 0.
+            for (int i = 0, running = 0; i < n; running += idata[i], ++i) {
+                odata[i] = running;
+            }
             timer().endCpuTimer();
         }
 
@@ -31,8 +35,14 @@ namespace StreamCompaction {
         int compactWithoutScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+            int kept = 0; // number of non-zero elements copied so far
+            for (int src = 0; src < n; ++src) {
+                const int value = idata[src];
+                if (value == 0) continue; // zeros are dropped
+                odata[kept++] = value;
+            }
             timer().endCpuTimer();
-            return -1;
+            return kept;
         }
 
         /**
@@ -41,10 +51,30 @@ namespace StreamCompaction {
          * @returns the number of elements remaining after compaction.
          */
         int compactWithScan(int n, int *odata, const int *idata) {
+            int *flags = new int[n];
+            int *sum = new int[n];
+            int cnt = 0;
+            sum[0] = 0;
             timer().startCpuTimer();
             // TODO
+            for (int i = 0; i < n; i++) {
+                flags[i] = (idata[i] == 0 ? 0 : 1);
+            }
+            // scan (prefix sum)
+            for (int i = 1; i < n; i++) {
+                sum[i] = sum[i - 1] + flags[i];
+            }
+            // stream compaction
+            for (int i = 0; i < n; i++) {
+                if (flags[i] == 1) {
+                    odata[sum[i]] = idata[i];
+                }
+            }
             timer().endCpuTimer();
-            return -1;
+            cnt = sum[n - 1] + 1;
+            delete[] flags;
+            delete[] sum;
+            return cnt;
         }
     }
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..0fa29d8 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -3,6 +3,16 @@
 #include "common.h"
 #include "efficient.h"
 
+const int blockSize = 128; // threads per block for every kernel launch in this file
+// Device-side 2^d, computed as a left shift.
+__device__ inline int twoPow(int d) {
+    return 1 << d;
+}
+// Host-side 2^d, computed as a left shift.
+inline int twoPowHost(int d) {
+    return 1 << d;
+}
+
 namespace StreamCompaction {
     namespace Efficient {
         using StreamCompaction::Common::PerformanceTimer;
@@ -12,13 +22,98 @@ namespace StreamCompaction {
             return timer;
         }
 
+        __global__ void kernUpSweep(int n, int d, int *x) {
+            const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+            const int span = twoPow(d + 1); // width of one segment at level d
+            // The first thread of each segment adds the left half's sum into the segment's last slot.
+            if (tid < n && tid % span == 0) x[tid + span - 1] += x[tid + twoPow(d) - 1];
+        }
+
+        __global__ void kernDownSweep(int n, int d, int *x) {
+            const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+            if (tid >= n || tid % twoPow(d + 1) != 0) return; // only the first thread of a segment works
+            const int left = tid + twoPow(d) - 1;      // last slot of the segment's left half
+            const int right = tid + twoPow(d + 1) - 1; // last slot of the whole segment
+            const int leftSum = x[left];
+            x[left] = x[right];  // pass the incoming prefix down to the left half
+            x[right] += leftSum; // right half receives prefix + left half's sum
+        }
+
+        /**
+         * Work-efficient (Blelloch) exclusive scan, as used by scan() and compact() below.
+         *
+         * Padding:
+         *   If the input length n is not a power of two, the device buffer is allocated
+         *   with the next power of two, size = 2^ilog2ceil(n), so that it forms a complete
+         *   balanced binary tree of partial sums. Only the first n results are meaningful;
+         *   the entries in [n, size) are padding.
+         *
+         * Up-sweep (reduce), d = 0 .. log2(size) - 1:
+         *   One thread is launched per array index. A thread below the kernel's bound n works
+         *   only when idx % 2^(d+1) == 0, i.e. when idx starts a 2^(d+1)-wide segment.
+         *   It adds the last element of the segment's left half, x[idx + 2^d - 1],
+         *   into the last element of the whole segment, x[idx + 2^(d+1) - 1].
+         *   The distance between the two elements doubles on every pass, and after the
+         *   final pass the last element of each segment holds the sum of that segment.
+         *
+         * Down-sweep, d = log2(size) - 1 .. 0:
+         *   The last element of the array (the tree root) is first set to 0.
+         *   The same threads (idx % 2^(d+1) == 0) then, for their segment:
+         *     1. save the left half's sum,       tmp = x[idx + 2^d - 1];
+         *     2. pass the segment's value left,  x[idx + 2^d - 1] = x[idx + 2^(d+1) - 1];
+         *     3. add the saved sum on the right, x[idx + 2^(d+1) - 1] += tmp.
+         *   Each pass pushes the running prefix one level further down the tree, so after
+         *   the last pass x[i] holds the sum of all inputs before index i (exclusive scan).
+         *
+         * Every pass depends on the previous one, so the host launches one kernel per
+         * value of d and synchronizes between launches.
+         *
+         * Cost: 2 * log2(size) kernel launches per scan (one per level in each phase),
+         * each launching one thread per element of the padded array.
+         *
+         * Example with n = 4, input [3, 1, 7, 0]:
+         *   up-sweep   d = 0: [3, 4, 7, 7]
+         *   up-sweep   d = 1: [3, 4, 7, 11]
+         *   clear root      : [3, 4, 7, 0]
+         *   down-sweep d = 1: [3, 0, 7, 4]
+         *   down-sweep d = 0: [0, 3, 4, 11]
+         */
+
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            if (n <= 0) return; // nothing to scan
+            // Pad the working buffer to the next power of two,
+            // e.g. n = 253 -> ilog2ceil(253) = 8 -> size = twoPowHost(8) = 256.
+            const int size = twoPowHost(ilog2ceil(n));
+            const int levels = ilog2ceil(size); // number of passes in each sweep
+            dim3 blockPerGrids((size + blockSize - 1) / blockSize);
+            int *dev_idata;
+
+            cudaMalloc((void **)&dev_idata, size * sizeof(int));
+            // Zero everything first so the padding in [n, size) holds defined values.
+            cudaMemset(dev_idata, 0, size * sizeof(int));
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
             timer().startGpuTimer();
             // TODO
+            // Up-sweep: build partial sums level by level over the whole padded buffer.
+            for (int d = 0; d < levels; d++) {
+                kernUpSweep<<<blockPerGrids, blockSize>>>(size, d, dev_idata);
+                cudaDeviceSynchronize(); // level d + 1 reads what level d wrote
+            }
+            // Clear the root (last element) before the down-sweep.
+            cudaMemset(dev_idata + size - 1, 0, sizeof(int));
+
+            // Down-sweep: push prefixes back down the tree, top level first.
+            for (int d = levels - 1; d >= 0; d--) {
+                kernDownSweep<<<blockPerGrids, blockSize>>>(size, d, dev_idata);
+                cudaDeviceSynchronize();
+            }
             timer().endGpuTimer();
+            // Only the first n results are real; the padding is discarded.
+            cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost);
+            cudaFree(dev_idata);
         }
 
         /**
@@ -31,10 +126,54 @@ namespace StreamCompaction {
          * @returns      The number of elements remaining after compaction.
          */
         int compact(int n, int *odata, const int *idata) {
+            if (n <= 0) return 0; // nothing to compact
+            int *dev_bools;
+            int *dev_indices;
+            int *dev_idata;
+            int *dev_odata;
+            // The scan runs on a power-of-two sized buffer; entries in [n, size) are padding.
+            const int size = twoPowHost(ilog2ceil(n));
+            const int levels = ilog2ceil(size); // number of passes in each sweep
+            dim3 blockPerGrids((n + blockSize - 1) / blockSize);
+            dim3 fullBlockPerGrids((size + blockSize - 1) / blockSize);
+
+            cudaMalloc((void **)&dev_bools, size * sizeof(int));
+            cudaMalloc((void **)&dev_indices, size * sizeof(int));
+            cudaMalloc((void **)&dev_idata, size * sizeof(int));
+            cudaMalloc((void **)&dev_odata, size * sizeof(int));
+            cudaMemset(dev_indices, 0, size * sizeof(int)); // padding must hold defined values
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
             timer().startGpuTimer();
             // TODO
+            // Flag non-zero elements, then scan the flags to get each kept element's output index.
+            Common::kernMapToBoolean<<<blockPerGrids, blockSize>>>(n, dev_bools, dev_idata);
+            cudaDeviceSynchronize();
+            cudaMemcpy(dev_indices, dev_bools, n * sizeof(int), cudaMemcpyDeviceToDevice);
+            for (int d = 0; d < levels; d++) {
+                kernUpSweep<<<fullBlockPerGrids, blockSize>>>(size, d, dev_indices);
+                cudaDeviceSynchronize();
+            }
+            cudaMemset(dev_indices + size - 1, 0, sizeof(int)); // clear the root before the down-sweep
+            for (int d = levels - 1; d >= 0; d--) {
+                kernDownSweep<<<fullBlockPerGrids, blockSize>>>(size, d, dev_indices);
+                cudaDeviceSynchronize();
+            }
+            // Move every flagged element to its scanned index.
+            Common::kernScatter<<<blockPerGrids, blockSize>>>(n, dev_odata, dev_idata, dev_bools, dev_indices);
             timer().endGpuTimer();
-            return -1;
+
+            // The scan is exclusive, so the total kept is indices[n - 1] + bools[n - 1].
+            int lastIndex = 0;
+            int lastBool = 0;
+            cudaMemcpy(&lastIndex, dev_indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            cudaMemcpy(&lastBool, dev_bools + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            const int cnt = lastIndex + lastBool;
+            cudaMemcpy(odata, dev_odata, cnt * sizeof(int), cudaMemcpyDeviceToHost);
+            cudaFree(dev_idata);
+            cudaFree(dev_odata);
+            cudaFree(dev_indices);
+            cudaFree(dev_bools);
+            return cnt;
         }
     }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..b383717 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -3,6 +3,12 @@
 #include "common.h"
 #include "naive.h"
 
+const int blockSize = 128; // threads per block for the kernel launch in this file
+// Device-side 2^d, computed as a left shift.
+__device__ inline int twoPow(int d) {
+    return 1 << d;
+}
+
 namespace StreamCompaction {
     namespace Naive {
         using StreamCompaction::Common::PerformanceTimer;
@@ -13,13 +19,46 @@ namespace StreamCompaction {
         }
         // TODO: __global__
 
+        __global__ void kernNaiveScan(int n, int d, int *odata, int *idata) {
+            int idx = blockDim.x * blockIdx.x + threadIdx.x;
+            if (idx >= n) return;
+            // Add adjacent elements to get the prefix sum
+            if (idx >= twoPow(d - 1)) 
+                odata[idx] = idata[idx - twoPow(d - 1)] + idata[idx];
+            else
+                odata[idx] = idata[idx];
+        }
+
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            int *dev_idata;
+            int *dev_odata;
+            dim3 blocksPerGrid((n + blockSize - 1) / blockSize);
+            // allocate
+            cudaMalloc((void **)&dev_idata, n * sizeof(int));
+            checkCUDAError("allcoate dev_idata failed!\n");
+            cudaMalloc((void **)&dev_odata, n * sizeof(int));
+            checkCUDAError("allcoate dev_odata failed!\n");
+
+            // move data to device
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            
             timer().startGpuTimer();
-            // TODO
+            // TODO: Naive Scan
+            for (int d = 1; d <= ilog2ceil(n); d++) {
+                kernNaiveScan<<<blocksPerGrid, blockSize>>>(n, d, dev_odata, dev_idata);
+                std::swap(dev_odata, dev_idata);
+            }
             timer().endGpuTimer();
+
+            // shift right
+            odata[0] = 0;
+            cudaMemcpy(odata + 1, dev_idata, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_idata);
+            cudaFree(dev_odata);
         }
     }
 }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..810d7ca 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -22,7 +22,11 @@ namespace StreamCompaction {
             // TODO use `thrust::exclusive_scan`
             // example: for device_vectors dv_in and dv_out:
             // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            thrust::device_vector<int> dv_in(idata, idata + n);
+            thrust::device_vector<int> dv_out(n);
+            thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
             timer().endGpuTimer();
+            thrust::copy(dv_out.begin(), dv_out.end(), odata);
         }
     }
 }

From 39e4649cfeb3b10f43396b6d725128ed69fa2a17 Mon Sep 17 00:00:00 2001
From: miaokeda <inverse_boom@outlook.com>
Date: Mon, 17 Apr 2023 20:14:17 +0800
Subject: [PATCH 2/2] init

---
 stream_compaction/cpu.cu    | 20 ++++++++++----------
 stream_compaction/naive.cu  |  6 +++---
 stream_compaction/thrust.cu |  4 ++--
 3 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 7d6bb41..83796b0 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -51,29 +51,29 @@ namespace StreamCompaction {
          * @returns the number of elements remaining after compaction.
          */
         int compactWithScan(int n, int *odata, const int *idata) {
-            int *flags = new int[n];
-            int *sum = new int[n];
+            int *bools = new int[n];   // bools[i] is 1 when idata[i] is kept, else 0
+            int *indices = new int[n]; // indices[i] = number of kept elements before index i
             int cnt = 0;
-            sum[0] = 0;
+            if (n > 0) indices[0] = 0; // an empty input has no slot 0 to initialise
             timer().startCpuTimer();
             // TODO
             for (int i = 0; i < n; i++) {
-                flags[i] = (idata[i] == 0 ? 0 : 1);
+                bools[i] = (idata[i] == 0 ? 0 : 1);
             }
             // scan (prefix sum)
             for (int i = 1; i < n; i++) {
-                sum[i] = sum[i - 1] + flags[i];
+                indices[i] = indices[i - 1] + bools[i - 1]; // exclusive: count only flags before i
             }
             // stream compaction
             for (int i = 0; i < n; i++) {
-                if (flags[i] == 1) {
-                    odata[sum[i]] = idata[i];
+                if (bools[i] == 1) {
+                    odata[indices[i]] = idata[i];
                 }
             }
             timer().endCpuTimer();
-            cnt = sum[n - 1] + 1;
-            delete[] flags;
-            delete[] sum;
+            if (n > 0) cnt = indices[n - 1] + bools[n - 1]; // exclusive scan leaves out the last flag
+            delete[] bools;
+            delete[] indices;
             return cnt;
         }
     }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index b383717..9b68b79 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -23,8 +23,8 @@ namespace StreamCompaction {
             int idx = blockDim.x * blockIdx.x + threadIdx.x;
             if (idx >= n) return;
             // Add adjacent elements to get the prefix sum
-            if (idx >= twoPow(d - 1)) 
-                odata[idx] = idata[idx - twoPow(d - 1)] + idata[idx];
+            if (idx >= twoPow(d))
+                odata[idx] = idata[idx] + idata[idx - twoPow(d)];
             else
                 odata[idx] = idata[idx];
         }
@@ -47,7 +47,7 @@ namespace StreamCompaction {
             
             timer().startGpuTimer();
             // TODO: Naive Scan
-            for (int d = 1; d <= ilog2ceil(n); d++) {
+            for (int d = 0; d < ilog2ceil(n); d++) {
                 kernNaiveScan<<<blocksPerGrid, blockSize>>>(n, d, dev_odata, dev_idata);
                 std::swap(dev_odata, dev_idata);
             }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 810d7ca..7cb6311 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -18,12 +18,12 @@ namespace StreamCompaction {
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            thrust::device_vector<int> dv_in(idata, idata + n);
+            thrust::device_vector<int> dv_out(n);
             timer().startGpuTimer();
             // TODO use `thrust::exclusive_scan`
             // example: for device_vectors dv_in and dv_out:
             // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
-            thrust::device_vector<int> dv_in(idata, idata + n);
-            thrust::device_vector<int> dv_out(n);
             thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
             timer().endGpuTimer();
             thrust::copy(dv_out.begin(), dv_out.end(), odata);