Commit 8f52871

update
0 parents  commit 8f52871

File tree: 5 files changed, +178 -0 lines changed


.editorconfig

+12
@@ -0,0 +1,12 @@
# EditorConfig is awesome: https://EditorConfig.org

# top-most EditorConfig file
root = true

[*]
indent_style = space
indent_size = 4
end_of_line = lf
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = false

.gitignore

+24
@@ -0,0 +1,24 @@
# macOS
.DS_Store

# editor
.vs/
.vscode/
.idea/

# build
cmake*/
build/
target/

# files
*.o
*.a
*.out
*.dll
*.lib
*.obj
*.exe
*.lock
.mdnice/
*/*.md

README.md

+90
@@ -0,0 +1,90 @@
# CUDA-Beginner-Course-CPP-Version
# Introduction to CUDA 12.1 Parallel Programming (C++ Edition)

***Note that this repository is under active development.***

## Progress

| Section | Videos | Codes |
| :------ | :----------------------------------------------------------- | :-------------------------------------------- |
| 01 | [Episode 1: Introduction to NVIDIA CUDA and Setting Up the Development Environment on Windows](https://www.bilibili.com/video/BV1Sj411H7Qq/) | |
| 02 | [Episode 2: Installing the CUDA Development Environment on Ubuntu](https://www.bilibili.com/video/BV1je411U7yX/) | |
| 03 | [Episode 3: Querying CUDA Device Information and Cross-Platform Compilation of Your First CUDA Program](https://www.bilibili.com/video/BV1oc411x7Gt/) | [course01_hello_cuda](./course01_hello_cuda/) |
| | | |

## Todo

- [ ] ...
- [ ] ...

## Acknowledgements

Thanks to the following excellent public learning resources.

- [codingonion/awesome-cuda-rocm-fpga](https://github.com/codingonion/awesome-cuda-rocm-fpga) <img src="https://img.shields.io/github/stars/codingonion/awesome-cuda-rocm-fpga?style=social"/> : A collection of awesome public NVIDIA CUDA, cuDNN, TensorRT, AMD ROCm, and FPGA projects.

- [codingonion/cuda-beginner-course-cpp-version](https://github.com/codingonion/cuda-beginner-course-cpp-version) <img src="https://img.shields.io/github/stars/codingonion/cuda-beginner-course-cpp-version?style=social"/> : Companion code for the bilibili video course "Introduction to CUDA 12.1 Parallel Programming (C++ Edition)".

- [codingonion/cuda-beginner-course-rust-version](https://github.com/codingonion/cuda-beginner-course-rust-version) <img src="https://img.shields.io/github/stars/codingonion/cuda-beginner-course-rust-version?style=social"/> : Companion code for the bilibili video course "Introduction to CUDA 12.1 Parallel Programming (Rust Edition)".

- [codingonion/cuda-beginner-course-python-version](https://github.com/codingonion/cuda-beginner-course-python-version) <img src="https://img.shields.io/github/stars/codingonion/cuda-beginner-course-python-version?style=social"/> : Companion code for the bilibili video course "Introduction to CUDA 12.1 Parallel Programming (Python Edition)".

- [NVIDIA CUDA Docs](https://docs.nvidia.com/cuda/) : CUDA Toolkit Documentation.

- [NVIDIA/cuda-samples](https://github.com/NVIDIA/cuda-samples) <img src="https://img.shields.io/github/stars/NVIDIA/cuda-samples?style=social"/> : Samples for CUDA developers that demonstrate features in the CUDA Toolkit.

- [NVIDIA/CUDALibrarySamples](https://github.com/NVIDIA/CUDALibrarySamples) <img src="https://img.shields.io/github/stars/NVIDIA/CUDALibrarySamples?style=social"/> : CUDA Library Samples.

- [HeKun-NVIDIA/CUDA-Programming-Guide-in-Chinese](https://github.com/HeKun-NVIDIA/CUDA-Programming-Guide-in-Chinese) <img src="https://img.shields.io/github/stars/HeKun-NVIDIA/CUDA-Programming-Guide-in-Chinese?style=social"/> : A Chinese translation of the CUDA C Programming Guide.

- [brucefan1983/CUDA-Programming](https://github.com/brucefan1983/CUDA-Programming) <img src="https://img.shields.io/github/stars/brucefan1983/CUDA-Programming?style=social"/> : Sample codes for my CUDA programming book.

- [YouQixiaowu/CUDA-Programming-with-Python](https://github.com/YouQixiaowu/CUDA-Programming-with-Python) <img src="https://img.shields.io/github/stars/YouQixiaowu/CUDA-Programming-with-Python?style=social"/> : Python versions of the sample code from the book CUDA Programming, using the pycuda module.

- [QINZHAOYU/CudaSteps](https://github.com/QINZHAOYU/CudaSteps) <img src="https://img.shields.io/github/stars/QINZHAOYU/CudaSteps?style=social"/> : A CUDA learning path based on the book "CUDA Programming: Basics and Practice" by Zheyong Fan.

- [sangyc10/CUDA-code](https://github.com/sangyc10/CUDA-code) <img src="https://img.shields.io/github/stars/sangyc10/CUDA-code?style=social"/> : Companion code for the bilibili video tutorial series "CUDA Programming Basics (continuously updated)".

- [RussWong/CUDATutorial](https://github.com/RussWong/CUDATutorial) <img src="https://img.shields.io/github/stars/RussWong/CUDATutorial?style=social"/> : A CUDA tutorial to help people learn CUDA programming from scratch.

- [DefTruth/cuda-learn-note](https://github.com/DefTruth/cuda-learn-note) <img src="https://img.shields.io/github/stars/DefTruth/cuda-learn-note?style=social"/> : 🎉 Personal CUDA notes and a collection of frequently asked interview questions, updated occasionally: sgemm, sgemv, warp reduce, block reduce, dot product, elementwise, softmax, layernorm, rmsnorm, hist, etc.

- [Liu-xiandong/How_to_optimize_in_GPU](https://github.com/Liu-xiandong/How_to_optimize_in_GPU) <img src="https://img.shields.io/github/stars/Liu-xiandong/How_to_optimize_in_GPU?style=social"/> : A series of GPU optimization topics that introduce in detail how to optimize CUDA kernels, covering several basic kernel optimizations, including elementwise, reduce, sgemv, sgemm, etc. The performance of these kernels is basically at or near the theoretical limit.

- [enp1s0/ozIMMU](https://github.com/enp1s0/ozIMMU) <img src="https://img.shields.io/github/stars/enp1s0/ozIMMU?style=social"/> : FP64-equivalent GEMM via Int8 Tensor Cores using the Ozaki scheme. [arxiv.org/abs/2306.11975](https://arxiv.org/abs/2306.11975)

- [Bruce-Lee-LY/matrix_multiply](https://github.com/Bruce-Lee-LY/matrix_multiply) <img src="https://img.shields.io/github/stars/Bruce-Lee-LY/matrix_multiply?style=social"/> : Several common methods of matrix multiplication implemented on CPU and NVIDIA GPU using C++11 and CUDA.

- [Bruce-Lee-LY/cuda_hgemm](https://github.com/Bruce-Lee-LY/cuda_hgemm) <img src="https://img.shields.io/github/stars/Bruce-Lee-LY/cuda_hgemm?style=social"/> : Several optimization methods for half-precision general matrix multiplication (HGEMM) using Tensor Cores with the WMMA API and MMA PTX instructions.

- [Bruce-Lee-LY/cuda_hgemv](https://github.com/Bruce-Lee-LY/cuda_hgemv) <img src="https://img.shields.io/github/stars/Bruce-Lee-LY/cuda_hgemv?style=social"/> : Several optimization methods for half-precision general matrix-vector multiplication (HGEMV) using CUDA cores.

- [Cjkkkk/CUDA_gemm](https://github.com/Cjkkkk/CUDA_gemm) <img src="https://img.shields.io/github/stars/Cjkkkk/CUDA_gemm?style=social"/> : A simple high-performance CUDA GEMM implementation.

- [AyakaGEMM/Hands-on-GEMM](https://github.com/AyakaGEMM/Hands-on-GEMM) <img src="https://img.shields.io/github/stars/AyakaGEMM/Hands-on-GEMM?style=social"/> : A GEMM tutorial.

- [zpzim/MSplitGEMM](https://github.com/zpzim/MSplitGEMM) <img src="https://img.shields.io/github/stars/zpzim/MSplitGEMM?style=social"/> : Large matrix multiplication in CUDA.

- [jundaf2/CUDA-INT8-GEMM](https://github.com/jundaf2/CUDA-INT8-GEMM) <img src="https://img.shields.io/github/stars/jundaf2/CUDA-INT8-GEMM?style=social"/> : CUDA 8-bit Tensor Core matrix multiplication based on the m16n16k16 WMMA API.

- [chanzhennan/cuda_gemm_benchmark](https://github.com/chanzhennan/cuda_gemm_benchmark) <img src="https://img.shields.io/github/stars/chanzhennan/cuda_gemm_benchmark?style=social"/> : Based on gtest/benchmark; refer to [https://github.com/Liu-xiandong/How_to_optimize_in_GPU](https://github.com/Liu-xiandong/How_to_optimize_in_GPU).

- [YuxueYang1204/CudaDemo](https://github.com/YuxueYang1204/CudaDemo) <img src="https://img.shields.io/github/stars/YuxueYang1204/CudaDemo?style=social"/> : Implement custom operators in PyTorch with CUDA/C++.

- [CoffeeBeforeArch/cuda_programming](https://github.com/CoffeeBeforeArch/cuda_programming) <img src="https://img.shields.io/github/stars/CoffeeBeforeArch/cuda_programming?style=social"/> : Code from the "CUDA Crash Course" YouTube series by CoffeeBeforeArch.

- [rbaygildin/learn-gpgpu](https://github.com/rbaygildin/learn-gpgpu) <img src="https://img.shields.io/github/stars/rbaygildin/learn-gpgpu?style=social"/> : Algorithms implemented in CUDA + resources about GPGPU.

- [PacktPublishing/Learn-CUDA-Programming](https://github.com/PacktPublishing/Learn-CUDA-Programming) <img src="https://img.shields.io/github/stars/PacktPublishing/Learn-CUDA-Programming?style=social"/> : Learn CUDA Programming, published by Packt.

- [PacktPublishing/Hands-On-GPU-Accelerated-Computer-Vision-with-OpenCV-and-CUDA](https://github.com/PacktPublishing/Hands-On-GPU-Accelerated-Computer-Vision-with-OpenCV-and-CUDA) <img src="https://img.shields.io/github/stars/PacktPublishing/Hands-On-GPU-Accelerated-Computer-Vision-with-OpenCV-and-CUDA?style=social"/> : Hands-On GPU-Accelerated Computer Vision with OpenCV and CUDA, published by Packt.

- [PacktPublishing/Hands-On-GPU-Programming-with-Python-and-CUDA](https://github.com/PacktPublishing/Hands-On-GPU-Programming-with-Python-and-CUDA) <img src="https://img.shields.io/github/stars/PacktPublishing/Hands-On-GPU-Programming-with-Python-and-CUDA?style=social"/> : Hands-On GPU Programming with Python and CUDA, published by Packt.

course01_hello_cuda/CMakeLists.txt

+21
@@ -0,0 +1,21 @@
cmake_minimum_required(VERSION 3.22)
project(course01_hello_cuda)

# Target the Ampere architecture (compute capability 8.6).
set(CMAKE_CUDA_ARCHITECTURES 86)
if(CMAKE_SYSTEM_NAME MATCHES "Linux")
    set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc)
endif()
enable_language(CUDA)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
if(NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE Release)
endif()
set(CMAKE_CXX_FLAGS "-Wall -Wextra")
set(CMAKE_CXX_FLAGS_DEBUG "-g")
set(CMAKE_CXX_FLAGS_RELEASE "-O3")

add_executable(course01_hello_cuda course01_hello_cuda.cu)
set_target_properties(course01_hello_cuda PROPERTIES
    CUDA_SEPARABLE_COMPILATION ON)
course01_hello_cuda/course01_hello_cuda.cu

+31
@@ -0,0 +1,31 @@
#include <cstdio>

// CUDA kernel function (launched from the CPU host, executed on the GPU device).
// CUDA kernel qualifier: __global__
__global__ void hello_cuda_from_gpu() {
    printf("GPU: Hello, CUDA! (C++ version)\n");
}

// Ordinary function (called and executed on the CPU host).
void hello_cuda_from_cpu() {
    printf("CPU: Hello, CUDA! (C++ version)\n");
}

int main() {
    // GPU: Hello, CUDA! (C++ version)
    {
        const int GRID_DIM = 2;  // grid size (number of thread blocks)
        const int BLOCK_DIM = 8; // block size (number of threads per block)
        // Kernel launch with execution configuration <<<...>>>; total threads = 2 * 8 = 16.
        hello_cuda_from_gpu<<<GRID_DIM, BLOCK_DIM>>>(); // CUDA kernel launch
        cudaDeviceSynchronize(); // synchronize the CPU host with the GPU device
    }
    printf("\n");
    // CPU: Hello, CUDA! (C++ version)
    {
        for (int i = 0; i < 16; ++i) {
            hello_cuda_from_cpu();
        }
    }
    return 0;
}
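
The launch configuration above starts 2 blocks of 8 threads each, so the greeting is printed 16 times, but the output does not reveal which thread produced which line. Below is a minimal sketch of how the built-in index variables blockIdx.x, threadIdx.x, blockDim.x, and gridDim.x could make that visible; the kernel print_thread_ids is a hypothetical addition for illustration and is not part of this commit.

#include <cstdio>

// Hypothetical kernel: each of the 2 * 8 = 16 threads reports its own coordinates.
__global__ void print_thread_ids() {
    // Global thread index = block offset + thread offset within the block.
    const int global_id = blockIdx.x * blockDim.x + threadIdx.x;
    printf("block %d of %d, thread %d of %d, global id %d\n",
           blockIdx.x, gridDim.x, threadIdx.x, blockDim.x, global_id);
}

int main() {
    print_thread_ids<<<2, 8>>>(); // same <<<grid, block>>> configuration as above
    cudaDeviceSynchronize();      // wait until all device-side printf output is flushed
    return 0;
}

Within a block the lines typically appear in thread order, but the order in which the two blocks print is not guaranteed, which is a first hint that CUDA threads run concurrently.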
