From 0fd35de86cbee114f78404b8ea2d72ba031754a6 Mon Sep 17 00:00:00 2001 From: miaokeda Date: Sun, 16 Apr 2023 00:24:04 +0800 Subject: [PATCH 1/2] updated all todos --- .clang-format | 213 +++++++++++++++++++++++++++++++++ stream_compaction/common.cu | 8 ++ stream_compaction/cpu.cu | 34 +++++- stream_compaction/efficient.cu | 141 +++++++++++++++++++++- stream_compaction/naive.cu | 41 ++++++- stream_compaction/thrust.cu | 4 + 6 files changed, 437 insertions(+), 4 deletions(-) create mode 100644 .clang-format diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..e295476 --- /dev/null +++ b/.clang-format @@ -0,0 +1,213 @@ +# 语言: None, Cpp, Java, JavaScript, ObjC, Proto, TableGen, TextProto +Language: Cpp +# BasedOnStyle: LLVM + +# 访问说明符(public、private等)的偏移 +AccessModifierOffset: -4 + +# 开括号(开圆括号、开尖括号、开方括号)后的对齐: Align, DontAlign, AlwaysBreak(总是在开括号后换行) +AlignAfterOpenBracket: Align + +# 连续赋值时,对齐所有等号 +AlignConsecutiveAssignments: false + +# 连续声明时,对齐所有声明的变量名 +AlignConsecutiveDeclarations: false + +# 右对齐逃脱换行(使用反斜杠换行)的反斜杠 +AlignEscapedNewlines: Right + +# 水平对齐二元和三元表达式的操作数 +AlignOperands: true + +# 对齐连续的尾随的注释 +AlignTrailingComments: true + +# 不允许函数声明的所有参数在放在下一行 +AllowAllParametersOfDeclarationOnNextLine: false + +# 不允许短的块放在同一行 +AllowShortBlocksOnASingleLine: true + +# 允许短的case标签放在同一行 +AllowShortCaseLabelsOnASingleLine: true + +# 允许短的函数放在同一行: None, InlineOnly(定义在类中), Empty(空函数), Inline(定义在类中,空函数), All +AllowShortFunctionsOnASingleLine: None + +# 允许短的if语句保持在同一行 +AllowShortIfStatementsOnASingleLine: true + +# 允许短的循环保持在同一行 +AllowShortLoopsOnASingleLine: true + +# 总是在返回类型后换行: None, All, TopLevel(顶级函数,不包括在类中的函数), +# AllDefinitions(所有的定义,不包括声明), TopLevelDefinitions(所有的顶级函数的定义) +AlwaysBreakAfterReturnType: None + +# 总是在多行string字面量前换行 +AlwaysBreakBeforeMultilineStrings: false + +# 总是在template声明后换行 +AlwaysBreakTemplateDeclarations: true + +# false表示函数实参要么都在同一行,要么都各自一行 +BinPackArguments: true + +# false表示所有形参要么都在同一行,要么都各自一行 +BinPackParameters: true + +# 大括号换行,只有当BreakBeforeBraces设置为Custom时才有效 +BraceWrapping: + # class定义后面 + AfterClass: false + # 控制语句后面 + AfterControlStatement: false + # enum定义后面 + AfterEnum: false + # 函数定义后面 + AfterFunction: false + # 命名空间定义后面 + AfterNamespace: false + # struct定义后面 + AfterStruct: false + # union定义后面 + AfterUnion: false + # extern之后 + AfterExternBlock: false + # catch之前 + BeforeCatch: false + # else之前 + BeforeElse: false + # 缩进大括号 + IndentBraces: false + # 分离空函数 + SplitEmptyFunction: false + # 分离空语句 + SplitEmptyRecord: false + # 分离空命名空间 + SplitEmptyNamespace: false + +# 在二元运算符前换行: None(在操作符后换行), NonAssignment(在非赋值的操作符前换行), All(在操作符前换行) +BreakBeforeBinaryOperators: NonAssignment + +# 在大括号前换行: Attach(始终将大括号附加到周围的上下文), Linux(除函数、命名空间和类定义,与Attach类似), +# Mozilla(除枚举、函数、记录定义,与Attach类似), Stroustrup(除函数定义、catch、else,与Attach类似), +# Allman(总是在大括号前换行), GNU(总是在大括号前换行,并对于控制语句的大括号增加额外的缩进), WebKit(在函数前换行), Custom +# 注:这里认为语句块也属于函数 +BreakBeforeBraces: Custom + +# 在三元运算符前换行 +BreakBeforeTernaryOperators: false + +# 在构造函数的初始化列表的冒号后换行 +BreakConstructorInitializers: AfterColon + +#BreakInheritanceList: AfterColon + +BreakStringLiterals: false + +# 每行字符的限制,0表示没有限制 +ColumnLimit: 0 + +CompactNamespaces: true + +# 构造函数的初始化列表要么都在同一行,要么都各自一行 +ConstructorInitializerAllOnOneLineOrOnePerLine: false + +# 构造函数的初始化列表的缩进宽度 +ConstructorInitializerIndentWidth: 4 + +# 延续的行的缩进宽度 +ContinuationIndentWidth: 4 + +# 去除C++11的列表初始化的大括号{后和}前的空格 +Cpp11BracedListStyle: true + +# 继承最常用的指针和引用的对齐方式 +DerivePointerAlignment: false + +# 固定命名空间注释 +FixNamespaceComments: true + +# 缩进case标签 +IndentCaseLabels: false + +IndentPPDirectives: None + +# 缩进宽度 +IndentWidth: 4 + +# 函数返回类型换行时,缩进函数声明或函数定义的函数名 +IndentWrappedFunctionNames: false + +# 保留在块开始处的空行 +KeepEmptyLinesAtTheStartOfBlocks: false + +# 连续空行的最大数量 +MaxEmptyLinesToKeep: 1 + +# 命名空间的缩进: None, Inner(缩进嵌套的命名空间中的内容), All +NamespaceIndentation: None + +# 指针和引用的对齐: Left, Right, Middle +PointerAlignment: Right + +# 允许重新排版注释 +ReflowComments: true + +# 允许排序#include +SortIncludes: false + +# 允许排序 using 声明 +SortUsingDeclarations: false + +# 在C风格类型转换后添加空格 +SpaceAfterCStyleCast: false + +# 在Template 关键字后面添加空格 +SpaceAfterTemplateKeyword: true + +# 在赋值运算符之前添加空格 +SpaceBeforeAssignmentOperators: true + +# SpaceBeforeCpp11BracedList: true + +# SpaceBeforeCtorInitializerColon: true + +# SpaceBeforeInheritanceColon: true + +# 开圆括号之前添加一个空格: Never, ControlStatements, Always +SpaceBeforeParens: ControlStatements + +# SpaceBeforeRangeBasedForLoopColon: true + +# 在空的圆括号中添加空格 +SpaceInEmptyParentheses: false + +# 在尾随的评论前添加的空格数(只适用于//) +SpacesBeforeTrailingComments: 1 + +# 在尖括号的<后和>前添加空格 +SpacesInAngles: false + +# 在C风格类型转换的括号中添加空格 +SpacesInCStyleCastParentheses: false + +# 在容器(ObjC和JavaScript的数组和字典等)字面量中添加空格 +SpacesInContainerLiterals: true + +# 在圆括号的(后和)前添加空格 +SpacesInParentheses: false + +# 在方括号的[后和]前添加空格,lamda表达式和未指明大小的数组的声明不受影响 +SpacesInSquareBrackets: false + +# 标准: Cpp03, Cpp11, Auto +Standard: Cpp11 + +# tab宽度 +TabWidth: 4 + +# 使用tab字符: Never, ForIndentation, ForContinuationAndIndentation, Always +UseTab: Never \ No newline at end of file diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..6aec5d3 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -24,6 +24,9 @@ namespace StreamCompaction { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx >= n) return; + bools[idx] = idata[idx] == 0 ? 0 : 1; } /** @@ -33,6 +36,11 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + int idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx >= n) return; + if (bools[idx] != 0) { + odata[indices[idx]] = idata[idx]; + } } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..7d6bb41 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -20,6 +20,10 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + odata[0] = 0; + for (int i = 1; i < n; i++) { + odata[i] = odata[i - 1] + idata[i - 1]; + } timer().endCpuTimer(); } @@ -31,8 +35,14 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int pos = 0; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[pos++] = idata[i]; + } + } timer().endCpuTimer(); - return -1; + return pos; } /** @@ -41,10 +51,30 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { + int *flags = new int[n]; + int *sum = new int[n]; + int cnt = 0; + sum[0] = 0; timer().startCpuTimer(); // TODO + for (int i = 0; i < n; i++) { + flags[i] = (idata[i] == 0 ? 0 : 1); + } + // scan (prefix sum) + for (int i = 1; i < n; i++) { + sum[i] = sum[i - 1] + flags[i]; + } + // stream compaction + for (int i = 0; i < n; i++) { + if (flags[i] == 1) { + odata[sum[i]] = idata[i]; + } + } timer().endCpuTimer(); - return -1; + cnt = sum[n - 1] + 1; + delete[] flags; + delete[] sum; + return cnt; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..0fa29d8 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,6 +3,16 @@ #include "common.h" #include "efficient.h" +const int blockSize = 128; + +__device__ inline int twoPow(int d) { + return (1 << (d)); +} + +inline int twoPowHost(int d) { + return (1 << (d)); +} + namespace StreamCompaction { namespace Efficient { using StreamCompaction::Common::PerformanceTimer; @@ -12,13 +22,98 @@ namespace StreamCompaction { return timer; } + __global__ void kernUpSweep(int n, int d, int *x) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx >= n) return; + if (idx % twoPow(d + 1) == 0) + x[idx + twoPow(d + 1) - 1] += x[idx + twoPow(d) - 1]; + } + + __global__ void kernDownSweep(int n, int d, int *x) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx >= n) return; + if (idx % twoPow(d + 1) == 0) { + int tmp = x[idx + twoPow(d) - 1]; + x[idx + twoPow(d) - 1] = x[idx + twoPow(d + 1) - 1]; + x[idx + twoPow(d + 1) - 1] += tmp; + } + } + + /** + * ʵУijȲ2ݴεὫչС2ݴδС + * ĺôǿԽֳɹģͬ飬ڲм㡣 + * ڼǰ׺ͺ󣬰ѶIJ֣3ԪأΪ0ɡ + * + * ˵UpSweep׶Σÿ̴߳һԪأ + * Ԫص±idx % 2^(d+1) == 0 + * Ԫصֵǰ2^dԪصԪصֵ + * ͽÿΪ2^dԪضӦĺͼ + * һִlog2(size)Σÿһδľ붼һε + * + * DownSweep׶ΣȽһԪΪ0Ȼһ㿪ʼ + * ÿ̴߳һԪأԪص±idx % 2^(d+1) == 0 + * Ԫصֵǰ2^dԪصԪصֵǰֵӵֵϡ + * ͽÿΪ2^dԪضӦĺʹϴݡ + * ͬҲִlog2(size)Ρǰ׺;ͼˡ + */ + + /** + * In this implementation, if the length of the input array is not a power of 2, + * it will be extended to the smallest power of 2 size. + * This is done to facilitate parallel computation by dividing the input array into equally-sized subarrays. + * After computing the prefix sum, the excess part of the array (i.e., the last 3 elements) is set to 0. + * + * Specifically, in the UpSweep phase, each thread processes one element of the array. + * If the index of this element satisfies idx % 2^(d+1) == 0, + * then the value of this element is added to the value of the element located 2^d positions in front of it. + * This way, the sums of every two elements that are 2^d apart are calculated. This process is repeated log2(size) times, + * where each iteration processes elements that are twice as far apart as the previous iteration. + * + * In the DownSweep phase, the last element of the array is set to 0. + * Starting from the last level, each thread processes one element of the array. + * If the index of this element satisfies idx % 2^(d+1) == 0, + * then the value of this element is swapped with the value of the element located 2^d positions in front of it, + * and the value of the latter element is added to the former element. + * This way, the sums of every two elements that are 2^d apart are propagated upwards from the bottom of the array. + * Again, this process is repeated log2(size) times. + * Finally, the prefix sum of the entire array is computed. + */ + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int size = twoPowHost(ilog2ceil(n)); // ensure the size is pow of 2 + // for example: + // if n = 253, let size equal to 256. + // ilog2ceil(253) = [log2(253)] + 1 = log2(128) + 1 = 8 + // twoPowHost(8) = 256 + dim3 blockPerGrids((size + blockSize - 1) / blockSize); + int *dev_idata; + + cudaMalloc((void **)&dev_idata, size * sizeof(int)); + + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); // TODO + // UpSweep + for (int d = 0; d < ilog2ceil(size); d++) { + kernUpSweep<<>>(n, d, dev_idata); + cudaDeviceSynchronize(); // ensure that the previous cuda jobs have completed + } + // set the last value of dev_idata to zero + cudaMemset(dev_idata + size - 1, 0, sizeof(int)); + + // DownSweep + for (int d = ilog2ceil(size) - 1; d >= 0; d--) { + kernDownSweep<<>>(n, d, dev_idata); + cudaDeviceSynchronize(); + } timer().endGpuTimer(); + cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); } /** @@ -31,10 +126,54 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { + int *dev_bools; + int *dev_indices; + int *dev_idata; + int *dev_odata; + int size = twoPowHost(ilog2ceil(n)); + int cnt = 0; + + dim3 blockPerGrids((n + blockSize - 1) / blockSize); + dim3 fullBlockPerGrids((size + blockSize - 1) / blockSize); + + cudaMalloc((void **)&dev_bools, size * sizeof(int)); + cudaMalloc((void **)&dev_indices, size * sizeof(int)); + cudaMalloc((void **)&dev_idata, size * sizeof(int)); + cudaMalloc((void **)&dev_odata, size * sizeof(int)); + + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); // TODO + Common::kernMapToBoolean<<>>(n, dev_bools, dev_idata); + cudaDeviceSynchronize(); + cudaMemcpy(dev_indices, dev_bools, n * sizeof(int), cudaMemcpyDeviceToDevice); + + // scan + for (int d = 0; d < ilog2ceil(size); d++) { + kernUpSweep<<>>(n, d, dev_indices); + cudaDeviceSynchronize(); + } + + cudaMemset(dev_indices + size - 1, 0, sizeof(int)); + + for (int d = ilog2ceil(size) - 1; d >= 0; d--) { + kernDownSweep<<>>(n, d, dev_indices); + cudaDeviceSynchronize(); + } + + Common::kernScatter<<>>(n, dev_odata, dev_idata, dev_bools, dev_indices); timer().endGpuTimer(); - return -1; + + cudaMemcpy(&cnt, dev_indices + size - 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(odata, dev_odata, cnt * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_indices); + cudaFree(dev_bools); + + return cnt; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..b383717 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,6 +3,12 @@ #include "common.h" #include "naive.h" +const int blockSize = 128; + +__device__ inline int twoPow(int d) { + return (1 << (d)); +} + namespace StreamCompaction { namespace Naive { using StreamCompaction::Common::PerformanceTimer; @@ -13,13 +19,46 @@ namespace StreamCompaction { } // TODO: __global__ + __global__ void kernNaiveScan(int n, int d, int *odata, int *idata) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx >= n) return; + // Add adjacent elements to get the prefix sum + if (idx >= twoPow(d - 1)) + odata[idx] = idata[idx - twoPow(d - 1)] + idata[idx]; + else + odata[idx] = idata[idx]; + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int *dev_idata; + int *dev_odata; + dim3 blocksPerGrid((n + blockSize - 1) / blockSize); + // allocate + cudaMalloc((void **)&dev_idata, n * sizeof(int)); + checkCUDAError("allcoate dev_idata failed!\n"); + cudaMalloc((void **)&dev_odata, n * sizeof(int)); + checkCUDAError("allcoate dev_odata failed!\n"); + + // move data to device + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + // TODO: Naive Scan + for (int d = 1; d <= ilog2ceil(n); d++) { + kernNaiveScan<<>>(n, d, dev_odata, dev_idata); + std::swap(dev_odata, dev_idata); + } timer().endGpuTimer(); + + // shift right + odata[0] = 0; + cudaMemcpy(odata + 1, dev_idata, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); + cudaFree(dev_odata); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..810d7ca 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -22,7 +22,11 @@ namespace StreamCompaction { // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::device_vector dv_in(idata, idata + n); + thrust::device_vector dv_out(n); + thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); timer().endGpuTimer(); + thrust::copy(dv_out.begin(), dv_out.end(), odata); } } } From 39e4649cfeb3b10f43396b6d725128ed69fa2a17 Mon Sep 17 00:00:00 2001 From: miaokeda Date: Mon, 17 Apr 2023 20:14:17 +0800 Subject: [PATCH 2/2] init --- stream_compaction/cpu.cu | 20 ++++++++++---------- stream_compaction/naive.cu | 6 +++--- stream_compaction/thrust.cu | 4 ++-- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 7d6bb41..83796b0 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -51,29 +51,29 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { - int *flags = new int[n]; - int *sum = new int[n]; + int *bools = new int[n]; + int *indices = new int[n]; int cnt = 0; - sum[0] = 0; + indices[0] = 0; timer().startCpuTimer(); // TODO for (int i = 0; i < n; i++) { - flags[i] = (idata[i] == 0 ? 0 : 1); + bools[i] = (idata[i] == 0 ? 0 : 1); } // scan (prefix sum) for (int i = 1; i < n; i++) { - sum[i] = sum[i - 1] + flags[i]; + indices[i] = indices[i - 1] + bools[i]; } // stream compaction for (int i = 0; i < n; i++) { - if (flags[i] == 1) { - odata[sum[i]] = idata[i]; + if (bools[i] == 1) { + odata[indices[i]] = idata[i]; } } timer().endCpuTimer(); - cnt = sum[n - 1] + 1; - delete[] flags; - delete[] sum; + cnt = indices[n - 1] + 1; + delete[] bools; + delete[] indices; return cnt; } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index b383717..9b68b79 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -23,8 +23,8 @@ namespace StreamCompaction { int idx = blockDim.x * blockIdx.x + threadIdx.x; if (idx >= n) return; // Add adjacent elements to get the prefix sum - if (idx >= twoPow(d - 1)) - odata[idx] = idata[idx - twoPow(d - 1)] + idata[idx]; + if (idx >= twoPow(d)) + odata[idx] = idata[idx] + idata[idx - twoPow(d)]; else odata[idx] = idata[idx]; } @@ -47,7 +47,7 @@ namespace StreamCompaction { timer().startGpuTimer(); // TODO: Naive Scan - for (int d = 1; d <= ilog2ceil(n); d++) { + for (int d = 0; d < ilog2ceil(n); d++) { kernNaiveScan<<>>(n, d, dev_odata, dev_idata); std::swap(dev_odata, dev_idata); } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 810d7ca..7cb6311 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,12 +18,12 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + thrust::device_vector dv_in(idata, idata + n); + thrust::device_vector dv_out(n); timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); - thrust::device_vector dv_in(idata, idata + n); - thrust::device_vector dv_out(n); thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); timer().endGpuTimer(); thrust::copy(dv_out.begin(), dv_out.end(), odata);