diff --git a/.azure-pipelines/integration-test.yml b/.azure-pipelines/integration-test.yml index 4d96581ad..ff86a3e32 100644 --- a/.azure-pipelines/integration-test.yml +++ b/.azure-pipelines/integration-test.yml @@ -13,9 +13,9 @@ jobs: strategy: matrix: cuda11: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.2 pool: name: mscclpp @@ -30,10 +30,8 @@ jobs: inputs: targetType: 'inline' script: | - curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz - tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp mkdir build && cd build - MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release .. + cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON .. make -j workingDirectory: '$(System.DefaultWorkingDirectory)' @@ -112,3 +110,15 @@ jobs: set -e python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file test/deploy/perf_ndmv4.jsonl workingDirectory: '$(System.DefaultWorkingDirectory)' + + - task: Bash@3 + name: PythonAllReduceBenchmark + displayName: Python Allreduce Benchmark + inputs: + targetType: 'inline' + script: | + set -e + export PATH=/usr/local/mpi/bin:$PATH + python3 -m pip install . + mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py + workingDirectory: '$(System.DefaultWorkingDirectory)' diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml index bb158646e..7c9d35094 100644 --- a/.azure-pipelines/multi-nodes-test.yml +++ b/.azure-pipelines/multi-nodes-test.yml @@ -10,9 +10,9 @@ jobs: strategy: matrix: cuda11: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.2 pool: name: mscclpp-it container: @@ -25,12 +25,9 @@ jobs: inputs: targetType: 'inline' script: | - curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz - tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp mkdir build && cd build - MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_PEERMEM_CHECK=ON .. + cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON .. make -j - make pylib-copy workingDirectory: '$(System.DefaultWorkingDirectory)' - task: DownloadSecureFile@1 @@ -83,7 +80,7 @@ jobs: tail -f output/mscclit-000000 & CHILD_PID=$! parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/run_tests.sh mscclpp-test' + -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test' kill $CHILD_PID - task: Bash@3 @@ -102,7 +99,7 @@ jobs: tail -f output/mscclit-000000 & CHILD_PID=$! 
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/run_tests.sh mp-ut' + -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mp-ut' kill $CHILD_PID - task: Bash@3 @@ -121,7 +118,26 @@ jobs: tail -f output/mscclit-000000 & CHILD_PID=$! parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ - -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/run_tests.sh pytests' + -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh pytests' + kill $CHILD_PID + + - task: Bash@3 + name: RunMultiNodePythonBenchmark + displayName: Run multi-nodes python benchmark + inputs: + targetType: 'inline' + script: | + set -e + HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile + SSH_OPTION="StrictHostKeyChecking=no" + KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} + rm -rf output/* + mkdir -p output + touch output/mscclit-000000 + tail -f output/mscclit-000000 & + CHILD_PID=$! + parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \ + -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark' kill $CHILD_PID - task: AzureCLI@2 diff --git a/.azure-pipelines/ut.yml b/.azure-pipelines/ut.yml index 31b8091cd..78b679e8d 100644 --- a/.azure-pipelines/ut.yml +++ b/.azure-pipelines/ut.yml @@ -15,9 +15,9 @@ jobs: strategy: matrix: cuda11: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.2 container: image: $[ variables['containerImage'] ] @@ -30,10 +30,8 @@ jobs: inputs: targetType: 'inline' script: | - curl -L -C- https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz - tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp mkdir build && cd build - MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release .. + cmake -DCMAKE_BUILD_TYPE=Release .. 
make -j workingDirectory: '$(System.DefaultWorkingDirectory)' @@ -79,11 +77,5 @@ jobs: script: | set -e export PATH=/usr/local/mpi/bin:$PATH - cd build && make pylib-copy - if [[ '$(containerImage)' == *'cuda11'* ]]; then - pip3 install -r ../python/test/requirements_cu11.txt - else - pip3 install -r ../python/test/requirements_cu12.txt - fi - mpirun -tag-output -np 8 ~/.local/bin/pytest ../python/test/test_mscclpp.py -x + mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x workingDirectory: '$(System.DefaultWorkingDirectory)' diff --git a/.github/ISSUE_TEMPLATE/documentation-improvement.md b/.github/ISSUE_TEMPLATE/documentation-improvement.md new file mode 100644 index 000000000..e552d4db8 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/documentation-improvement.md @@ -0,0 +1,10 @@ +--- +name: Documentation improvement +about: Enhance or fix documentation +title: "[Doc]" +labels: '' +assignees: '' + +--- + + diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index 2db0a91fb..7295171e9 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -13,7 +13,7 @@ jobs: name: Analyze runs-on: 'ubuntu-latest' container: - image: ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda-version }} + image: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda-version }} permissions: actions: read @@ -24,7 +24,7 @@ jobs: fail-fast: false matrix: language: [ 'cpp', 'python' ] - cuda-version: [ 'cuda11.8', 'cuda12.1' ] + cuda-version: [ 'cuda11.8', 'cuda12.2' ] steps: - name: Checkout repository @@ -45,7 +45,7 @@ jobs: - name: Build run: | - MPI_HOME=/usr/local/mpi cmake -DBYPASS_PEERMEM_CHECK=ON . + cmake -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON . make -j - name: Perform CodeQL Analysis diff --git a/.github/workflows/integration-test-backup.yml b/.github/workflows/integration-test-backup.yml index 24dacf9ec..476ae8f76 100644 --- a/.github/workflows/integration-test-backup.yml +++ b/.github/workflows/integration-test-backup.yml @@ -10,10 +10,10 @@ jobs: shell: bash strategy: matrix: - cuda: [ cuda11.8, cuda12.1 ] + cuda: [ cuda11.8, cuda12.2 ] container: - image: "ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda }}" + image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}" options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1 steps: @@ -23,7 +23,7 @@ jobs: - name: Build run: | mkdir build && cd build - MPI_HOME=/usr/local/mpi cmake -DCMAKE_BUILD_TYPE=Release .. + cmake -DCMAKE_BUILD_TYPE=Release .. 
make -j - name: Lock GPU clock frequency diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index aaffe9578..0c1babcdd 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -20,7 +20,7 @@ jobs: - name: Run cpplint run: | - CPPSOURCES=$(find ./ -regextype posix-extended -regex '.*\.(c|cpp|h|hpp|cc|cxx|cu)' -not -path "./build/*") + CPPSOURCES=$(find ./src ./include ./python ./test -regextype posix-extended -regex '.*\.(c|cpp|h|hpp|cc|cxx|cu)') clang-format -style=file --verbose --Werror --dry-run ${CPPSOURCES} pylint: diff --git a/.github/workflows/ut-backup.yml b/.github/workflows/ut-backup.yml index 620fe46c6..696266c49 100644 --- a/.github/workflows/ut-backup.yml +++ b/.github/workflows/ut-backup.yml @@ -11,10 +11,10 @@ jobs: timeout-minutes: 30 strategy: matrix: - cuda: [ cuda11.8, cuda12.1 ] + cuda: [ cuda11.8, cuda12.2 ] container: - image: "ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda }}" + image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}" options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1 steps: @@ -29,7 +29,7 @@ jobs: - name: Build run: | mkdir build && cd build - MPI_HOME=/usr/local/mpi cmake -DCMAKE_BUILD_TYPE=Release .. + cmake -DCMAKE_BUILD_TYPE=Release .. make -j working-directory: ${{ github.workspace }} @@ -54,11 +54,11 @@ jobs: - name: PyTests run: | set -e - cd build && make pylib-copy - mpirun --allow-run-as-root -tag-output -np 8 $(which pytest) ../python/test/test_mscclpp.py -x + mpirun --allow-run-as-root -tag-output -np 8 $(which pytest) ./python/test/test_mscclpp.py -x - name: ReportCoverage run: | + set -e cd build lcov --capture --directory . --output-file coverage.info lcov --remove coverage.info \ @@ -68,4 +68,4 @@ jobs: '*/test/*' \ '*/tools/*' \ --output-file coverage.info - lcov --list coverage.info + lcov --list coverage.info \ No newline at end of file diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 000000000..1b8c52dad --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,47 @@ +cff-version: 1.2.0 +title: "MSCCL++: A GPU-driven communication stack for scalable AI applications" +version: 0.4.2 +message: >- + If you use this project in your research, please cite it as below. +authors: + - given-names: Peng + family-names: Cheng + affiliation: Microsoft Research + - given-names: Changho + family-names: Hwang + affiliation: Microsoft Research + - given-names: Abhinav + family-names: Jangda + affiliation: Microsoft Research + - given-names: Suriya + family-names: Kalivardhan + affiliation: Microsoft Azure + - given-names: Binyang + family-names: Li + affiliation: Microsoft Azure + - given-names: Shuguang + family-names: Liu + affiliation: Microsoft Azure + - given-names: Saeed + family-names: Maleki + affiliation: Microsoft Research + - given-names: Madan + family-names: Musuvathi + affiliation: Microsoft Research + - given-names: Olli + family-names: Saarikivi + affiliation: Microsoft Research + - given-names: Wei + family-names: Tsui + affiliation: Microsoft Research + - given-names: Ziyue + family-names: Yang + affiliation: Microsoft Research + +repository-code: 'https://github.com/microsoft/mscclpp' +abstract: >- + MSCCL++ redefines the interface for inter-GPU communication, thereby + delivering a highly efficient and customizable communication stack + tailored for distributed GPU applications. 
+license: MIT +license-url: https://github.com/microsoft/mscclpp/blob/main/LICENSE diff --git a/CMakeLists.txt b/CMakeLists.txt index 982f9d568..6b90cbd86 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,18 +2,14 @@ # Licensed under the MIT license. set(MSCCLPP_MAJOR "0") -set(MSCCLPP_MINOR "3") -set(MSCCLPP_PATCH "0") +set(MSCCLPP_MINOR "4") +set(MSCCLPP_PATCH "2") set(MSCCLPP_SOVERSION ${MSCCLPP_MAJOR}) set(MSCCLPP_VERSION "${MSCCLPP_MAJOR}.${MSCCLPP_MINOR}.${MSCCLPP_PATCH}") cmake_minimum_required(VERSION 3.25) -project(mscclpp LANGUAGES CUDA CXX) -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CUDA_STANDARD 17) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra") -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall,-Wextra") +enable_language(CXX) # Code coverage from https://github.com/codecov/example-cpp11-cmake add_library(coverage_config INTERFACE) @@ -27,75 +23,114 @@ if(CMAKE_BUILD_TYPE MATCHES "Debug" AND CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang target_link_options(coverage_config INTERFACE --coverage) endif() -list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) - -# Format targets -include(${PROJECT_SOURCE_DIR}/cmake/AddFormatTargets.cmake) +list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) # Options option(ENABLE_TRACE "Enable tracing" OFF) option(USE_NPKIT "Use NPKIT" ON) option(BUILD_TESTS "Build tests" ON) option(BUILD_PYTHON_BINDINGS "Build Python bindings" ON) -option(ALLOW_GDRCOPY "Use GDRCopy, if available" OFF) -option(BYPASS_PEERMEM_CHECK "Bypass checking nvidia_peermem" OFF) +option(USE_CUDA "Use NVIDIA/CUDA." OFF) +option(USE_ROCM "Use AMD/ROCm." OFF) +option(BYPASS_GPU_CHECK "Bypass GPU check." OFF) + +if(BYPASS_GPU_CHECK) + if(USE_CUDA) + message("Bypassing GPU check: using NVIDIA/CUDA.") + find_package(CUDAToolkit REQUIRED) + elseif(USE_ROCM) + message("Bypassing GPU check: using AMD/ROCm.") + # Temporal fix for rocm5.6 + set(CMAKE_PREFIX_PATH "/opt/rocm;${CMAKE_PREFIX_PATH}") + find_package(hip REQUIRED) + else() + message(FATAL_ERROR "Bypassing GPU check: neither NVIDIA/CUDA nor AMD/ROCm is specified.") + endif() +else() + # Detect GPUs + include(CheckNvidiaGpu) + include(CheckAmdGpu) + if(NVIDIA_FOUND AND AMD_FOUND) + message("Detected NVIDIA/CUDA and AMD/ROCm: prioritizing NVIDIA/CUDA.") + set(USE_CUDA ON) + set(USE_ROCM OFF) + elseif(NVIDIA_FOUND) + message("Detected NVIDIA/CUDA.") + set(USE_CUDA ON) + set(USE_ROCM OFF) + elseif(AMD_FOUND) + message("Detected AMD/ROCm.") + set(USE_CUDA OFF) + set(USE_ROCM ON) + else() + message(FATAL_ERROR "Neither NVIDIA/CUDA nor AMD/ROCm is found.") + endif() +endif() + +# Declare project +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra") +if(USE_CUDA) + set(CMAKE_CUDA_STANDARD 17) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall,-Wextra") + project(mscclpp LANGUAGES CXX CUDA) -# Find CUDAToolkit. 
Set CUDA flags based on the detected CUDA version -find_package(CUDAToolkit REQUIRED) -if(CUDAToolkit_FOUND) + # CUDA 11 or higher is required if(CUDAToolkit_VERSION_MAJOR LESS 11) message(FATAL_ERROR "CUDA 11 or higher is required but detected ${CUDAToolkit_VERSION}") endif() + # Set CUDA architectures if(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 11) set(CMAKE_CUDA_ARCHITECTURES 80) endif() + # Hopper architecture if(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 12) set(CMAKE_CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES} 90) endif() + + set(GPU_LIBRARIES CUDA::cudart CUDA::cuda_driver) + set(GPU_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIRS}) +else() + set(CMAKE_HIP_STANDARD 17) + set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wall -Wextra") + project(mscclpp LANGUAGES CXX) + + set(CMAKE_HIP_ARCHITECTURES gfx90a gfx941 gfx942) + + set(GPU_LIBRARIES hip::device) + set(GPU_INCLUDE_DIRS ${hip_INCLUDE_DIRS}) endif() -set(CUDA_LIBRARIES CUDA::cudart CUDA::cuda_driver) - -# Find if nvidia_peermem is installed and loaded -if(NOT BYPASS_PEERMEM_CHECK) - execute_process(COMMAND sh -c "lsmod | grep nvidia_peermem" - RESULT_VARIABLE lsmod_result - OUTPUT_VARIABLE lsmod_output) - if(NOT lsmod_result EQUAL 0) - message(FATAL_ERROR "nvidia_peermem is not installed or not loaded.") - endif() -endif() + +# Format targets +include(${PROJECT_SOURCE_DIR}/cmake/AddFormatTargets.cmake) # Find ibverbs and libnuma find_package(IBVerbs REQUIRED) find_package(NUMA REQUIRED) - -# Find optional packages -if(ALLOW_GDRCOPY) - find_package(GDRCopy) -endif() +find_package(Threads REQUIRED) add_library(mscclpp_obj OBJECT) target_include_directories(mscclpp_obj - PRIVATE - ${CUDAToolkit_INCLUDE_DIRS} + SYSTEM PRIVATE + ${GPU_INCLUDE_DIRS} ${IBVERBS_INCLUDE_DIRS} - ${NUMA_INCLUDE_DIRS} - ${GDRCOPY_INCLUDE_DIRS}) -target_link_libraries(mscclpp_obj PRIVATE ${CUDA_LIBRARIES} ${NUMA_LIBRARIES} ${IBVERBS_LIBRARIES} ${GDRCOPY_LIBRARIES}) + ${NUMA_INCLUDE_DIRS}) +target_link_libraries(mscclpp_obj PRIVATE ${GPU_LIBRARIES} ${NUMA_LIBRARIES} ${IBVERBS_LIBRARIES} Threads::Threads) target_link_libraries(mscclpp_obj PUBLIC coverage_config) set_target_properties(mscclpp_obj PROPERTIES LINKER_LANGUAGE CXX POSITION_INDEPENDENT_CODE 1 VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION}) +if(USE_CUDA) + target_compile_definitions(mscclpp_obj PRIVATE USE_CUDA) +elseif(USE_ROCM) + target_compile_definitions(mscclpp_obj PRIVATE USE_ROCM) +endif() if(ENABLE_TRACE) target_compile_definitions(mscclpp_obj PRIVATE ENABLE_TRACE) endif() if(USE_NPKIT) target_compile_definitions(mscclpp_obj PRIVATE ENABLE_NPKIT) endif() -if(ALLOW_GDRCOPY AND GDRCOPY_FOUND) - target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_USE_GDRCOPY) - target_link_libraries(mscclpp_obj PRIVATE MSCCLPP::gdrcopy) -endif() # libmscclpp add_library(mscclpp SHARED) @@ -108,15 +143,19 @@ set_target_properties(mscclpp_static PROPERTIES VERSION ${MSCCLPP_VERSION} SOVER add_subdirectory(include) add_subdirectory(src) +if("${INSTALL_PREFIX}" STREQUAL "") + set(INSTALL_PREFIX "./") +endif() + install(TARGETS mscclpp_obj - FILE_SET HEADERS DESTINATION include) + FILE_SET HEADERS DESTINATION ${INSTALL_PREFIX}/include) install(TARGETS mscclpp - LIBRARY DESTINATION lib) + LIBRARY DESTINATION ${INSTALL_PREFIX}/lib) install(TARGETS mscclpp_static - ARCHIVE DESTINATION lib) + ARCHIVE DESTINATION ${INSTALL_PREFIX}/lib) # Tests -if (BUILD_TESTS) +if(BUILD_TESTS) enable_testing() # Called here to allow ctest from the build directory add_subdirectory(test) endif() diff --git a/README.md b/README.md index 
7f0112ec1..9796179d3 100644
--- a/README.md
+++ b/README.md
@@ -1,32 +1,56 @@
# MSCCL++

-GPU-driven computation & communication stack.
+[![Latest Release](https://img.shields.io/github/release/microsoft/mscclpp.svg)](https://github.com/microsoft/mscclpp/releases/latest)
+[![License](https://img.shields.io/github/license/microsoft/mscclpp.svg)](LICENSE)
+[![CodeQL](https://github.com/microsoft/mscclpp/actions/workflows/codeql-analysis.yml/badge.svg?branch=main)](https://github.com/microsoft/mscclpp/actions/workflows/codeql-analysis.yml)

-See [Quick Start](docs/quickstart.md) to quickly get started.
+| Pipelines                | Build Status      |
+|--------------------------|-------------------|
+| Unit Tests (CUDA)        | [![Build Status](https://dev.azure.com/binyli/HPC/_apis/build/status%2Fmscclpp-ut?branchName=main)](https://dev.azure.com/binyli/HPC/_build/latest?definitionId=4&branchName=main) |
+| Integration Tests (CUDA) | [![Build Status](https://dev.azure.com/binyli/HPC/_apis/build/status%2Fmscclpp-test?branchName=main)](https://dev.azure.com/binyli/HPC/_build/latest?definitionId=3&branchName=main) |
+
+*NOTE (Nov 2023): Azure pipelines for ROCm will be added soon.*

-See the latest performance evaluation on Azure [NDmv4](docs/performance-ndmv4.md).
+A GPU-driven communication stack for scalable AI applications.

-Build our Doxygen document by running `doxygen` in [`docs/`](docs/) directory. Run `python3 -m http.server ` in `docs/doxygen/html/` directory to serve the generated HTML files.
+See [Quick Start](docs/quickstart.md) to quickly get started.

## Overview

-MSCCL++ is a development kit for implementing highly optimized distributed GPU applications, in terms of both inter-GPU communication and GPU computation. MSCCL++ is specially designed for developers who want to fine-tune inter-GPU communication of their applications at the GPU kernel level, without awareness of detailed communication mechanisms. The key underlying concept of MSCCL++ is GPU-driven execution, where both communication and computation tasks are initiated by GPU not by CPU. That is, the communication and computation interfaces of MSCCL++ are provided as device-side APIs (called inside a GPU kernel), while the host-side APIs of MSCCL++ are for bootstrapping, initial connection setups, or background host threads for inter-GPU DMA and RDMA (called proxies). By using MSCCL++, we expect:
+MSCCL++ redefines inter-GPU communication interfaces, thereby delivering a highly efficient and customizable communication stack for distributed GPU applications. Its design is specifically tailored to accommodate diverse performance optimization scenarios often encountered in state-of-the-art AI applications. The figure below provides a high-level overview of MSCCL++ abstractions in CUDA, C, and Python.
+
+| MSCCL++ Abstractions Overview |
+|-------------------------------|
+| ![MSCCL++ Abstractions](docs/figs/abstractions.png) |
+
+The following highlights the key features of MSCCL++.
+
+* **Light-weight and multi-layer abstractions.** MSCCL++ provides communication abstractions at the lowest level, close to the hardware, and at the highest level, close to the application API. The lowest level of abstraction is ultra-lightweight, which enables a user to implement the data-movement logic of a collective operation such as AllReduce inside a GPU kernel extremely efficiently, without worrying about the memory ordering of different ops. The modularity of MSCCL++ enables a user to construct the building blocks of MSCCL++ at a high level of abstraction in Python and feed them to a CUDA kernel, which improves the user's productivity.
+
+* **1-sided 0-copy synchronous and asynchronous abstractions.** MSCCL++ provides fine-grained synchronous and asynchronous 0-copy 1-sided abstractions for communication primitives such as `put()`, `get()`, `signal()`, `flush()`, and `wait()` (see the sketch after this section). The 1-sided abstractions allow a user to asynchronously `put()` their data on the remote GPU as soon as it is ready, without requiring the remote side to issue any receive instruction. This enables users to easily implement flexible communication logic, such as overlapping communication with computation, or implementing customized collective communication algorithms, without worrying about potential deadlocks. Additionally, the 0-copy capability enables MSCCL++ to transfer data directly between users' buffers without intermediate internal buffers, which saves GPU bandwidth and memory capacity.

-* **Holistic Optimization for High GPU Utilization.** As both communication and computation are scheduled inside a GPU kernel at the same time, we can optimize end-to-end performance of distributed GPU applications from a global view. For example, we can minimize the GPU resource contention between communication and computation, which is known to often substantially degrade throughput of distributed deep learning applications.
+* **Unified abstractions for different interconnection hardware.** MSCCL++ provides consistent abstractions regardless of the location of the remote GPU (either on the local node or on a remote node) or the underlying link (either NVLink/xGMI or InfiniBand). This simplifies the code for inter-GPU communication, which is often complex due to the memory ordering of GPU/CPU reads/writes and is therefore error-prone.

-* **Fully Pipelined System to Reduce Overhead from the Control Plane.** We can eliminate control overhead from CPU by allowing GPU to autonomously schedule both communication and computation. This significantly reduces GPU scheduling overhead and CPU-GPU synchronization overhead. For example, this allows us to implement a highly fine-grained system pipelining (i.e., hiding communication delays by overlapping with computation), which has been difficult for CPU-controlled applications due to the large control/scheduling overhead.

-* **Runtime Performance Optimization for Dynamic Workload.** As we can easily implement flexible communication logics, we can optimize communication performance even during runtime. For example, we can implement the system to automatically choose different communication paths or different collective communication algorithms depending on the dynamic workload at runtime.
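> **Editor's note (not part of the patch):** to make the 1-sided, 0-copy primitives listed above concrete, here is a minimal device-side sketch. It assumes a host-initialized `SmChannel`; the method names follow `include/mscclpp/sm_channel_device.hpp`, but treat the exact signatures as assumptions rather than as project API documentation.

```cpp
// Sketch: exchange 1KB with a peer GPU using 1-sided, 0-copy primitives.
// Assumes the host has created the channel and copied this handle to the GPU.
#include <mscclpp/sm_channel_device.hpp>

__constant__ mscclpp::DeviceHandle<mscclpp::SmChannel> chan;

__global__ void exchange() {
  if (blockIdx.x == 0 && threadIdx.x == 0) {
    // 1-sided write directly into the peer's registered buffer (0-copy).
    chan.put(/*dstOffset*/ 0, /*srcOffset*/ 0, /*bytes*/ 1024,
             /*threadId*/ 0, /*numThreads*/ 1);
    chan.signal();  // make the write observable to the peer
    chan.wait();    // wait for the peer's matching signal()
  }
}
```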
+## Performance
+
+While the power of MSCCL++ is fully realized with application-specific optimization, it still delivers performance benefits even for collective communication operations. The following figures provide a comparison of the AllReduce throughput of MSCCL++ against NCCL 2.19.3. This benchmark was tested over two [Azure NDmv4 SKUs](https://learn.microsoft.com/en-us/azure/virtual-machines/ndm-a100-v4-series) (8 A100-80G GPUs per node).

-## Key Features (v0.3)
+The key motivation behind these results is scaling inference of LLMs using tensor parallelism. LLM requests are usually executed in two phases: prompt processing and token sampling. Prompt processing uses a large batch size that is usually equal to the request context length, and the corresponding AllReduce size is `len_context*dim_hidden*sizeof(fp16)`. For a context length of 2048 with a hidden dimension of 12288 (GPT-3 size), the AllReduce size is 48MB. Token sampling uses a smaller batch size, which corresponds to the number of concurrent user requests in the system, and the corresponding AllReduce size is `batch_size*dim_hidden*sizeof(fp16)`. For a concurrency of 16 users, the AllReduce size is 384KB. As the figures below demonstrate, MSCCL++ provides a significant speedup over NCCL, which is crucial for serving LLMs efficiently at large scale.

-MSCCL++ v0.3 supports the following features.
+
+| Single-node AllReduce | Two-node AllReduce |
+|-------------------------------|----------------------------|
+| ![MSCCL++ vs NCCL AllReduce (Single-node)](docs/figs/mscclpp_vs_nccl_comparison_num_nodes_1.jpeg) | ![MSCCL++ vs NCCL AllReduce (Two-node)](docs/figs/mscclpp_vs_nccl_comparison_num_nodes_2.jpeg) |

-### In-Kernel Communication Interfaces
## Key Concepts
-MSCCL++ provides inter-GPU communication interfaces to be called by a GPU thread. For example, the `put()` method in the following example copies 1KB data from the local GPU to a remote GPU. `channel` is a peer-to-peer communication channel between two GPUs, which consists of information on send/receive buffers. `channel` is initialized from the host side before the kernel execution.
+The following highlights key concepts of MSCCL++.
+
+### On-GPU Communication Interfaces: Channels
+
+MSCCL++ provides peer-to-peer communication methods between GPUs. A peer-to-peer connection between two GPUs is called a *Channel*. Channels are constructed by MSCCL++ host-side interfaces and copied to GPUs during initialization. Channels provide *GPU-side interfaces*, which means that all communication methods are defined as device functions to be called from GPU kernel code. For example, the `put()` method in the following example copies 1KB of data from the local GPU to a remote GPU.

```cpp
+// `ProxyChannel` will be explained in the following section.
__device__ mscclpp::DeviceHandle<mscclpp::SimpleProxyChannel> channel;
__global__ void gpuKernel() {
  ...
@@ -53,11 +77,17 @@ __device__ void barrier() {
}
```

-MSCCL++ provides consistent in-kernel interfaces, i.e., the above interfaces are used regardless of the location of the remote GPU (either on the local node or on a remote node) or the underlying link (either NVLink or InfiniBand).
+MSCCL++ provides consistent interfaces, i.e., the above interfaces are used regardless of the location of the remote GPU (either on the local node or on a remote node) or the underlying link (either NVLink/xGMI or InfiniBand).
+
+### ProxyChannel and SmChannel
+
+MSCCL++ delivers two types of channels, **ProxyChannel** and **SmChannel** (a short device-side sketch follows this section). `ProxyChannel` provides (R)DMA-based data copy and synchronization methods. When called, these methods send/receive a signal to/from a host-side proxy (hence the name `ProxyChannel`), which will trigger (R)DMA (such as `cudaMemcpy*` or `ibv_post_send`) or issue synchronization methods (such as `cudaStreamSynchronize` or `ibv_poll_cq`). Since the key functionalities are run by the proxy, `ProxyChannel` requires only a single GPU thread to call its methods. See all `ProxyChannel` methods [here](./include/mscclpp/proxy_channel_device.hpp).
+
+On the other hand, `SmChannel` provides memory-mapping-based copy and synchronization methods. When called, these methods directly use GPU threads to read/write from/to the remote GPU's memory space. Compared with `ProxyChannel`, `SmChannel` is especially performant for low-latency scenarios, though it may need many GPU threads calling its copy methods concurrently to achieve high copying bandwidth. See all `SmChannel` methods [here](./include/mscclpp/sm_channel_device.hpp).
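> **Editor's note (not part of the patch):** the device-side sketch referenced in the `ProxyChannel`/`SmChannel` section above. Method names follow `include/mscclpp/proxy_channel_device.hpp`; treat the exact signatures as assumptions.

```cpp
// Sketch: a ProxyChannel transfer needs only one GPU thread; the host-side
// proxy performs the actual (R)DMA once it receives the trigger.
#include <mscclpp/proxy_channel_device.hpp>

__constant__ mscclpp::DeviceHandle<mscclpp::SimpleProxyChannel> proxyChan;

__global__ void putViaProxy() {
  if (blockIdx.x == 0 && threadIdx.x == 0) {
    proxyChan.putWithSignal(/*offset*/ 0, /*bytes*/ 1 << 20);  // 1MB copy
    proxyChan.flush();  // block until the proxy has completed the transfer
  }
}
```

An `SmChannel` performing the same copy would instead typically be called by many threads of a block so that they move the data cooperatively, which is where its bandwidth comes from.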
### Host-Side Communication Proxy

-Some in-kernel communication interfaces of MSCCL++ send requests (called triggers) to a GPU-external helper that conducts key functionalities such as DMA or RDMA. This helper is called a proxy service or a proxy in short. MSCCL++ provides a default implementation of a proxy, which is a background host thread that busy polls triggers from GPUs and conducts functionalities accordingly. For example, the following is a typical host-side code for MSCCL++.
+MSCCL++ provides a default implementation of a host-side proxy for ProxyChannels, which is a background host thread that busy-polls triggers from GPUs and carries out the requested functionality accordingly. For example, the following is typical host-side code for MSCCL++.

```cpp
// Bootstrap: initialize control-plane connections between all ranks
@@ -120,19 +150,9 @@ public:

Customized proxies can be used for conducting a series of pre-defined data transfers within only a single trigger from GPU at runtime. This would be more efficient than sending a trigger for each data transfer one by one.

-### Flexible Customization
-
-Most of key components of MSCCL++ are designed to be easily customized. This enables MSCCL++ to easily adopt a new software / hardware technology and lets users implement algorithms optimized for their own use cases.
-
-### New in MSCCL++ v0.3 (Latest Release)
-* Updated interfaces
-* Add Python bindings and interfaces
-* Add Python unit tests
-* Add more configurable parameters
-* Add a new single-node AllReduce kernel
-* Fix bugs
+### Python Interfaces

-See details from https://github.com/microsoft/mscclpp/issues/89.
+MSCCL++ provides Python bindings and interfaces, which simplify integration with Python applications.

## Contributing

diff --git a/cmake/AddFormatTargets.cmake b/cmake/AddFormatTargets.cmake
index 71c3ef4ab..b95ad447b 100644
--- a/cmake/AddFormatTargets.cmake
+++ b/cmake/AddFormatTargets.cmake
@@ -26,11 +26,11 @@ find_program(BLACK black)
if (BLACK)
    message(STATUS "Found black: ${BLACK}")
    add_custom_target(check-format-py
-        COMMAND ${BLACK} --config ${PROJECT_SOURCE_DIR}/pyproject.toml --check ${PROJECT_SOURCE_DIR}/python ${PROJECT_SOURCE_DIR}/test
+        COMMAND ${BLACK} --config ${PROJECT_SOURCE_DIR}/pyproject.toml --check ${PROJECT_SOURCE_DIR}
    )
    add_dependencies(check-format check-format-py)
    add_custom_target(format-py
-        COMMAND ${BLACK} --config ${PROJECT_SOURCE_DIR}/pyproject.toml ${PROJECT_SOURCE_DIR}/python ${PROJECT_SOURCE_DIR}/test
+        COMMAND ${BLACK} --config ${PROJECT_SOURCE_DIR}/pyproject.toml ${PROJECT_SOURCE_DIR}
    )
    add_dependencies(format format-py)
else()
diff --git a/cmake/CheckAmdGpu.cmake b/cmake/CheckAmdGpu.cmake
new file mode 100644
index 000000000..3b26bfa5e
--- /dev/null
+++ b/cmake/CheckAmdGpu.cmake
@@ -0,0 +1,25 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+set(AMD_FOUND "FALSE")
+
+set(CMAKE_PREFIX_PATH "/opt/rocm;${CMAKE_PREFIX_PATH}")
+# Temporal fix for rocm5.6
+set(ENV{amd_comgr_DIR} "/opt/rocm/lib/cmake/amd_comgr")
+set(ENV{AMDDeviceLibs_DIR} "/opt/rocm/lib/cmake/AMDDeviceLibs")
+
+find_package(hip QUIET)
+
+if(NOT hip_FOUND)
+  return()
+endif()
+
+enable_language(HIP)
+
+set(CHECK_SRC "${CMAKE_CURRENT_SOURCE_DIR}/cmake/check_amd_gpu.hip")
+
+try_run(RUN_RESULT COMPILE_SUCCESS SOURCES ${CHECK_SRC})
+
+if(COMPILE_SUCCESS AND RUN_RESULT EQUAL 0)
+  set(AMD_FOUND "TRUE")
+endif()
diff --git a/cmake/CheckNvidiaGpu.cmake b/cmake/CheckNvidiaGpu.cmake
new file mode 100644
index 000000000..adc42ea00
--- /dev/null
+++ b/cmake/CheckNvidiaGpu.cmake
@@ -0,0 +1,36 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
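> **Editor's note (not part of the patch):** `CheckAmdGpu.cmake` (above) and `CheckNvidiaGpu.cmake` (continuing below) each compile and run a tiny probe via `try_run`, so auto-detection needs a visible GPU at configure time. A sketch of the resulting configure-time options, using only flags introduced in this diff:

```bash
# Sketch: three ways to configure MSCCL++ under the new GPU-check logic.

# 1) Auto-detect: the try_run probes decide USE_CUDA vs. USE_ROCM.
cmake -DCMAKE_BUILD_TYPE=Release ..

# 2) No GPU visible at configure time (e.g., a CI container): skip the probe.
cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON ..

# 3) AMD platform (per docs/quickstart.md): use HIPCC as the C++ compiler.
CXX=/path/to/hipcc cmake -DCMAKE_BUILD_TYPE=Release ..
```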
+ +set(NVIDIA_FOUND "FALSE") + +find_package(CUDAToolkit) + +if(NOT CUDAToolkit_FOUND) + return() +endif() + +set(CMAKE_CUDA_ARCHITECTURES "60") +if(NOT CMAKE_CUDA_COMPILER) + # In case the CUDA Toolkit directory is not in the PATH + find_program(CUDA_COMPILER + NAMES nvcc + PATHS ${CUDAToolkit_BIN_DIR}) + if(NOT CUDA_COMPILER) + message(WARNING "Could not find nvcc in ${CUDAToolkit_BIN_DIR}") + unset(CMAKE_CUDA_ARCHITECTURES) + return() + endif() + set(CMAKE_CUDA_COMPILER "${CUDA_COMPILER}") +endif() +enable_language(CUDA) + +set(CHECK_SRC "${CMAKE_CURRENT_SOURCE_DIR}/cmake/check_nvidia_gpu.cu") + +try_run(RUN_RESULT COMPILE_SUCCESS SOURCES ${CHECK_SRC}) + +if(COMPILE_SUCCESS AND RUN_RESULT EQUAL 0) + set(NVIDIA_FOUND "TRUE") +else() + unset(CMAKE_CUDA_ARCHITECTURES) + unset(CMAKE_CUDA_COMPILER) +endif() diff --git a/cmake/check_amd_gpu.hip b/cmake/check_amd_gpu.hip new file mode 100644 index 000000000..7537f7edc --- /dev/null +++ b/cmake/check_amd_gpu.hip @@ -0,0 +1,15 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include + +__global__ void kernel() {} + +int main() { + int cnt; + hipError_t err = hipGetDeviceCount(&cnt); + if (err != hipSuccess || cnt == 0) { + return 1; + } + return 0; +} diff --git a/cmake/check_nvidia_gpu.cu b/cmake/check_nvidia_gpu.cu new file mode 100644 index 000000000..672e70f28 --- /dev/null +++ b/cmake/check_nvidia_gpu.cu @@ -0,0 +1,15 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include + +__global__ void kernel() {} + +int main() { + int cnt; + cudaError_t err = cudaGetDeviceCount(&cnt); + if (err != cudaSuccess || cnt == 0) { + return 1; + } + return 0; +} diff --git a/docker/base-cuda12.1.dockerfile b/docker/base-cuda12.1.dockerfile deleted file mode 100644 index 5c5bcd602..000000000 --- a/docker/base-cuda12.1.dockerfile +++ /dev/null @@ -1,59 +0,0 @@ -FROM nvidia/cuda:12.1.1-devel-ubuntu20.04 - -LABEL maintainer="MSCCL++" -LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp - -ENV DEBIAN_FRONTEND=noninteractive - -RUN rm -rf /opt/nvidia - -RUN apt-get clean && \ - apt-get update && \ - apt-get install -y --no-install-recommends \ - build-essential \ - ca-certificates \ - curl \ - git \ - libcap2 \ - libnuma-dev \ - openssh-client \ - openssh-server \ - python3-dev \ - python3-pip \ - python3-setuptools \ - python3-wheel \ - sudo \ - wget \ - && \ - apt-get autoremove && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* /tmp/* - -# Install OFED -ENV OFED_VERSION=5.2-2.2.3.0 -RUN cd /tmp && \ - wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \ - tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \ - MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force --all && \ - rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}* - -# Install OpenMPI -ENV OPENMPI_VERSION=4.1.5 -RUN cd /tmp && \ - export ompi_v_parsed="$(echo ${OPENMPI_VERSION} | sed -E 's/^([0-9]+)\.([0-9]+)\..*/\1.\2/')" && \ - wget -q https://download.open-mpi.org/release/open-mpi/v${ompi_v_parsed}/openmpi-${OPENMPI_VERSION}.tar.gz && \ - tar xzf openmpi-${OPENMPI_VERSION}.tar.gz && \ - cd openmpi-${OPENMPI_VERSION} && \ - ./configure --prefix=/usr/local/mpi && \ - make -j && \ - make install && \ - cd .. 
&& \ - rm -rf /tmp/openmpi-${OPENMPI_VERSION}* - -ENV PATH="/usr/local/mpi/bin:${PATH}" \ - LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64:${LD_LIBRARY_PATH}" - -RUN echo PATH="${PATH}" > /etc/environment && \ - echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment - -ENTRYPOINT [] diff --git a/docker/base-dev-x.dockerfile b/docker/base-dev-x.dockerfile new file mode 100644 index 000000000..87d3f5c0d --- /dev/null +++ b/docker/base-dev-x.dockerfile @@ -0,0 +1,38 @@ +ARG BASE_IMAGE=ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1 +FROM ${BASE_IMAGE} + +LABEL maintainer="MSCCL++" +LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + htop \ + lcov \ + vim \ + && \ + apt-get autoremove && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* /tmp/* + +# Install cmake 3.26.4 +ENV CMAKE_VERSION="3.26.4" +ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \ + CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz" +RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \ + tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local && \ + rm -rf ${CMAKE_HOME}.tar.gz +ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}" + +# Install Python dependencies +ADD . /tmp/mscclpp +WORKDIR /tmp/mscclpp +ARG TARGET="cuda12.1" +RUN cuda_major_version=$(echo ${TARGET} | grep -oP 'cuda\K[0-9]+') && \ + python3 -m pip install --no-cache-dir -r python/requirements_cu${cuda_major_version}.txt + +# Set PATH +RUN echo PATH="${PATH}" > /etc/environment + +# Cleanup +RUN rm -rf /tmp/mscclpp +WORKDIR / diff --git a/docker/base-cuda11.8.dockerfile b/docker/base-x.dockerfile similarity index 87% rename from docker/base-cuda11.8.dockerfile rename to docker/base-x.dockerfile index 22e03443b..bf29f718a 100644 --- a/docker/base-cuda11.8.dockerfile +++ b/docker/base-x.dockerfile @@ -1,4 +1,5 @@ -FROM nvidia/cuda:11.8.0-devel-ubuntu20.04 +ARG BASE_IMAGE=nvidia/cuda:12.1.1-devel-ubuntu20.04 +FROM ${BASE_IMAGE} LABEL maintainer="MSCCL++" LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp @@ -7,8 +8,7 @@ ENV DEBIAN_FRONTEND=noninteractive RUN rm -rf /opt/nvidia -RUN apt-get clean && \ - apt-get update && \ +RUN apt-get update && \ apt-get install -y --no-install-recommends \ build-essential \ ca-certificates \ @@ -50,10 +50,12 @@ RUN cd /tmp && \ cd .. 
&& \ rm -rf /tmp/openmpi-${OPENMPI_VERSION}* +ARG EXTRA_LD_PATH=/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64 ENV PATH="/usr/local/mpi/bin:${PATH}" \ - LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/local/cuda-11.8/lib64:${LD_LIBRARY_PATH}" + LD_LIBRARY_PATH="/usr/local/mpi/lib:${EXTRA_LD_PATH}:${LD_LIBRARY_PATH}" RUN echo PATH="${PATH}" > /etc/environment && \ echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment ENTRYPOINT [] +WORKDIR / diff --git a/docker/build.sh b/docker/build.sh new file mode 100755 index 000000000..5b14bcc4c --- /dev/null +++ b/docker/build.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash + +set -e + +declare -A baseImageTable +baseImageTable=( + ["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu20.04" + ["cuda12.1"]="nvidia/cuda:12.1.1-devel-ubuntu20.04" + ["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04" +) + +declare -A extraLdPathTable +extraLdPathTable=( + ["cuda11.8"]="/usr/local/cuda-11.8/lib64" + ["cuda12.1"]="/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64" + ["cuda12.2"]="/usr/local/cuda-12.2/compat:/usr/local/cuda-12.2/lib64" +) + +GHCR="ghcr.io/microsoft/mscclpp/mscclpp" +TARGET=${1} + +print_usage() { + echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2]" +} + +if [[ ! -v "baseImageTable[${TARGET}]" ]]; then + echo "Invalid target: ${TARGET}" + print_usage + exit 1 +fi +echo "Target: ${TARGET}" + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" + +cd ${SCRIPT_DIR}/.. + +docker build -t ${GHCR}:base-${TARGET} \ + -f docker/base-x.dockerfile \ + --build-arg BASE_IMAGE=${baseImageTable[${TARGET}]} \ + --build-arg EXTRA_LD_PATH=${extraLdPathTable[${TARGET}]} \ + --build-arg TARGET=${TARGET} . + +docker build -t ${GHCR}:base-dev-${TARGET} \ + -f docker/base-dev-x.dockerfile \ + --build-arg BASE_IMAGE=${GHCR}:base-${TARGET} \ + --build-arg TARGET=${TARGET} . diff --git a/docker/dev-cuda11.8.dockerfile b/docker/dev-cuda11.8.dockerfile deleted file mode 100644 index 094772b06..000000000 --- a/docker/dev-cuda11.8.dockerfile +++ /dev/null @@ -1,28 +0,0 @@ -FROM ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8 - -LABEL maintainer="MSCCL++" -LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp - -ENV MSCCLPP_SRC_DIR="/tmp/mscclpp" \ - CMAKE_VERSION="3.26.4" - -ADD . ${MSCCLPP_SRC_DIR} -WORKDIR ${MSCCLPP_SRC_DIR} - -# Install cmake 3.26.4 -ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \ - CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz" -RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \ - tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local && \ - rm -rf ${CMAKE_HOME}.tar.gz -ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}" - -# Install pytest & dependencies -RUN python3 -m pip install --no-cache-dir -r python/test/requirements_cu11.txt - -# Set PATH -RUN echo PATH="${PATH}" > /etc/environment - -# Cleanup -WORKDIR / -RUN rm -rf ${MSCCLPP_SRC_DIR} diff --git a/docker/dev-cuda12.1.dockerfile b/docker/dev-cuda12.1.dockerfile deleted file mode 100644 index 70fe684c1..000000000 --- a/docker/dev-cuda12.1.dockerfile +++ /dev/null @@ -1,27 +0,0 @@ -FROM ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1 - -LABEL maintainer="MSCCL++" -LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp - -ENV MSCCLPP_SRC_DIR="/tmp/mscclpp" \ - CMAKE_VERSION="3.26.4" - -ADD . 
${MSCCLPP_SRC_DIR} -WORKDIR ${MSCCLPP_SRC_DIR} - -# Install cmake 3.26.4 -ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \ - CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz" -RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \ - tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local -ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}" - -# Install pytest & dependencies -RUN python3 -m pip install --no-cache-dir -r python/test/requirements_cu12.txt - -# Set PATH -RUN echo PATH="${PATH}" > /etc/environment - -# Cleanup -WORKDIR / -RUN rm -rf ${MSCCLPP_SRC_DIR} diff --git a/docker/release-cuda11.8.dockerfile b/docker/release-cuda11.8.dockerfile deleted file mode 100644 index 67963c583..000000000 --- a/docker/release-cuda11.8.dockerfile +++ /dev/null @@ -1,32 +0,0 @@ -FROM ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8 - -LABEL maintainer="MSCCL++" -LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp - -ENV MSCCLPP_HOME="/usr/local/mscclpp" \ - MSCCLPP_SRC_DIR="/tmp/mscclpp" \ - CMAKE_VERSION="3.26.4" - -# Download cmake 3.26.4 -ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \ - CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz" -RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \ - tar xzf ${CMAKE_HOME}.tar.gz -C /tmp - -# Install MSCCL++ -ADD . ${MSCCLPP_SRC_DIR} -WORKDIR ${MSCCLPP_SRC_DIR} -RUN rm -rf build && \ - mkdir build && \ - cd build && \ - ${CMAKE_HOME}/bin/cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=${MSCCLPP_HOME} .. && \ - make -j mscclpp && \ - make install/fast && \ - strip ${MSCCLPP_HOME}/lib/libmscclpp.so.[0-9]*.[0-9]*.[0-9]* - -ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${MSCCLPP_HOME}/lib" -RUN echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment - -# Cleanup -WORKDIR / -RUN rm -rf ${CMAKE_HOME}* ${MSCCLPP_SRC_DIR} diff --git a/docker/release-cuda12.1.dockerfile b/docker/release-cuda12.1.dockerfile deleted file mode 100644 index 7c1961121..000000000 --- a/docker/release-cuda12.1.dockerfile +++ /dev/null @@ -1,36 +0,0 @@ -FROM ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1 - -LABEL maintainer="MSCCL++" -LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp - -ENV MSCCLPP_HOME="/usr/local/mscclpp" \ - MSCCLPP_SRC_DIR="/tmp/mscclpp" \ - CMAKE_VERSION="3.26.4" - -# Download cmake 3.26.4 -ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \ - CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz" -RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \ - tar xzf ${CMAKE_HOME}.tar.gz -C /tmp - -# Install MSCCL++ -ADD . ${MSCCLPP_SRC_DIR} -WORKDIR ${MSCCLPP_SRC_DIR} -RUN rm -rf build && \ - mkdir build && \ - cd build && \ - ${CMAKE_HOME}/bin/cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=${MSCCLPP_HOME} .. && \ - make -j mscclpp mscclpp_static && \ - make install/fast && \ - strip ${MSCCLPP_HOME}/lib/libmscclpp.so.[0-9]*.[0-9]*.[0-9]* - -# Install MSCCL++ Python bindings -WORKDIR ${MSCCLPP_SRC_DIR} -RUN python3.8 -m pip install . 
-
-ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${MSCCLPP_HOME}/lib"
-RUN echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment
-
-# Cleanup
-WORKDIR /
-RUN rm -rf ${CMAKE_HOME}* ${MSCCLPP_SRC_DIR}
diff --git a/docs/.gitignore b/docs/.gitignore
index 94f90d1e6..00d9344fb 100644
--- a/docs/.gitignore
+++ b/docs/.gitignore
@@ -1 +1,3 @@
doxygen/
+_build/
+sphinx/
diff --git a/docs/Doxyfile b/docs/Doxyfile
index 0fa68bf2a..b2d5528e7 100644
--- a/docs/Doxyfile
+++ b/docs/Doxyfile
@@ -2043,7 +2043,7 @@ MAN_LINKS = NO
# captures the structure of the code including all documentation.
# The default value is: NO.
-GENERATE_XML = NO
+GENERATE_XML = YES
# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 000000000..d4bb2cbb9
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS ?=
+SPHINXBUILD ?= sphinx-build
+SOURCEDIR = .
+BUILDDIR = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 000000000..2bb9c1efb
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,27 @@
+## How to build docs
+
+1. Install `doxygen`.
+
+   ```bash
+   $ sudo apt-get install doxygen
+   ```
+
+2. Install the Python packages below. If you install them into your user-local directory, you need to add `~/.local/bin` to `$PATH` (to use `sphinx-build`).
+
+   ```bash
+   $ sudo python3 -m pip install sphinx sphinx_rtd_theme breathe
+   ```
+
+3. Create Doxygen documents.
+
+   ```bash
+   $ doxygen
+   ```
+
+4. Create Sphinx documents.
+
+   ```bash
+   $ sphinx-build -b html -Dbreathe_projects.mscclpp=$PWD/doxygen/xml $PWD $PWD/sphinx
+   ```
+
+5. Done. The HTML files will be in the `sphinx/` directory.
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 000000000..2e6544fa1
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,29 @@
+# Configuration file for the Sphinx documentation builder.
+# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = "mscclpp" +copyright = "2023, MSCCL++ Team" +author = "MSCCL++ Team" +release = "v0.4.2" + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = ["breathe"] + +templates_path = ["_templates"] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + +# Breathe configuration +breathe_default_project = "mscclpp" + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = "sphinx_rtd_theme" +html_static_path = ["_static"] diff --git a/docs/figs/abstractions.png b/docs/figs/abstractions.png new file mode 100644 index 000000000..e6183aa91 Binary files /dev/null and b/docs/figs/abstractions.png differ diff --git a/docs/figs/mscclpp_vs_nccl_comparison_num_nodes_1.jpeg b/docs/figs/mscclpp_vs_nccl_comparison_num_nodes_1.jpeg new file mode 100644 index 000000000..9c483b986 Binary files /dev/null and b/docs/figs/mscclpp_vs_nccl_comparison_num_nodes_1.jpeg differ diff --git a/docs/figs/mscclpp_vs_nccl_comparison_num_nodes_2.jpeg b/docs/figs/mscclpp_vs_nccl_comparison_num_nodes_2.jpeg new file mode 100644 index 000000000..6c8132565 Binary files /dev/null and b/docs/figs/mscclpp_vs_nccl_comparison_num_nodes_2.jpeg differ diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 000000000..ba060047c --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,26 @@ +.. MSCCL++ documentation master file, created by + sphinx-quickstart on Tue Sep 5 13:03:46 2023. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to MSCCL++'s documentation! +=================================== + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` + +Docs +==== + +.. doxygennamespace:: mscclpp + :members: diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 000000000..32bb24529 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/performance-ndmv4.md b/docs/performance-ndmv4.md index 28e38b0e7..4187b3b0a 100644 --- a/docs/performance-ndmv4.md +++ b/docs/performance-ndmv4.md @@ -1,50 +1,3 @@ # NDmv4 Performance -All results from NDmv4. 
NCCL version 2.17.1+cuda11.8, reported in-place numbers. - -nccl-tests command example: -```bash -mpirun --bind-to numa -hostfile /mnt/hostfile --tag-output --allow-run-as-root -map-by ppr:8:node --bind-to numa -mca pml ob1 -mca btl ^openib -mca btl_tcp_if_include eth0 -x PATH -x LD_PRELOAD=/mnt/nccl/build/lib/libnccl.so -x NCCL_IB_PCI_RELAXED_ORDERING=1 -x NCCL_SOCKET_IFNAME=eth0 -x CUDA_DEVICE_ORDER=PCI_BUS_ID -x NCCL_NET_GDR_LEVEL=5 -x NCCL_TOPO_FILE=/mnt/ndv4-topo.xml -x NCCL_DEBUG=WARN ./build/all_gather_perf -b 1K -e 1K -g 1 -c 1 -w 10 -n 10 -G 1 -``` - -mscclpp-tests command example: -```bash -mpirun -allow-run-as-root -map-by ppr:8:node -hostfile /mnt/hostfile ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1K -w 10 -n 10 -G 10 -k 0 -``` - -**NOTE:** NCCL AllGather leverages Ring algorithm instead of all-pairs alike algorithm, which greatly reduces inter-node transmission, causing significant higher performance. MSCCL++ should do something similar in the future - -### 1 node, 8 gpus/node -**Latency (us)** -| Message Size | NCCL AllGather | NCCL AllReduce | NCCL AllToAll | MSCCL AllToAll LL/LL128/Simple | MSCCL++ AllGather K0/K1/K2 | MSCCL++ AllReduce | -|:------------:|:--------------:|:--------------:|:-------------:|:------------------------------:|:--------------------------:|:-----------------:| -| 1K | 12.53 | **16.96** | 9.34 | **7.76** / 21.06 / 28.50 | 157.91 / 143.21 / 447.0 | 326.4 | - -**BusBW (GB/s)** -| Message Size | NCCL AllGather | NCCL AllReduce | NCCL AllToAll | MSCCL AllToAll LL/LL128/Simple | MSCCL++ AllGather K0/K1/K2 | MSCCL++ AllReduce | -|:------------:|:--------------:|:--------------:|:-------------:|:------------------------------:|:----------------------------:|:-----------------:| -| 1G | 253.59 | **231.45** | 254.69 | 217.05 / 216.98 / 217.15 | 125.06 / **255.64** / 124.89 | 22.55 | - -### 2 nodes, 1 gpu/node -**Latency (us)** -| Message Size | NCCL AllGather | NCCL AllReduce | NCCL AllToAll | MSCCL AllToAll LL/LL128/Simple | MSCCL++ AllGather K0/K1/K2 | MSCCL++ AllReduce | -|:------------:|:--------------:|:--------------:|:--------------:|:------------------------------:|:--------------------------:|:-----------------:| -| 1K | 16.08 | **21.27** | 29.84 | 14.67 / 29.12 / 35.43 | 15.32 / **13.84** / 26.08 | - | - -**BusBW (GB/s)** -| Message Size | NCCL AllGather | NCCL AllReduce | NCCL AllToAll | MSCCL AllToAll LL/LL128/Simple | MSCCL++ AllGather K0/K1/K2 | MSCCL++ AllReduce | -|:------------:|:--------------:|:--------------:|:-------------:|:------------------------------:|:--------------------------:|:-----------------:| -| 1G | 15.84 | **18.65** | 15.48 | 13.94 / 13.83 / 14.10 | **23.30** / 23.29 / 21.60 | - | - -### 2 nodes, 8 gpus/node -**Latency (us)** -| Message Size | NCCL AllGather | NCCL AllReduce | NCCL AllToAll | MSCCL AllToAll LL/LL128/Simple | MSCCL++ AllGather K0/K1/K2 | MSCCL++ AllReduce | -|:------------:|:--------------:|:--------------:|:-------------:|:------------------------------:|:--------------------------:|:-----------------:| -| 1K | 33.74 | **35.85** | 49.75 | **22.55** / 39.33 / 56.93 | 159.14 / 230.52 / 462.7 | - | - -**BusBW (GB/s)** -| Message Size | NCCL AllGather | NCCL AllReduce | NCCL AllToAll | MSCCL AllToAll LL/LL128/Simple | MSCCL++ AllGather K0/K1/K2 | MSCCL++ AllReduce | -|:------------:|:--------------:|:--------------:|:-------------:|:------------------------------:|:--------------------------:|:-----------------:| -| 1G | 177.05 | **183.82** | 37.80 | 40.17 / 40.18 / 40.23 | 44.19 / 9.31 / 
**209.33** | - |
| 4G | 186.01 | **188.18** | 37.81 | - / - / - | 44.60 / - / **234.08** | - |
-
+TBU
diff --git a/docs/quickstart.md b/docs/quickstart.md
index 9ccf1b6f9..af1bbe5f3 100644
--- a/docs/quickstart.md
+++ b/docs/quickstart.md
@@ -8,8 +8,10 @@
* ND_H100_v5
* [NC_A100_v4](https://learn.microsoft.com/en-us/azure/virtual-machines/nc-a100-v4-series) (TBD)
* Non-Azure Systems
- * NVIDIA A100 GPUs + CUDA >= 11.1.1
- * NVIDIA H100 GPUs + CUDA >= 12.0.0
+ * NVIDIA A100 GPUs + CUDA >= 11.8
+ * NVIDIA H100 GPUs + CUDA >= 12.0
+ * AMD MI250X GPUs + ROCm >= 5.7
+ * AMD MI300X GPUs + ROCm >= 5.7
* OS: tested over Ubuntu 18.04 and 20.04
* Libraries: [libnuma](https://github.com/numactl/numactl), MPI (optional)
* Others
@@ -25,10 +27,24 @@ CMake 3.25 or later is required.

```bash
$ git clone https://github.com/microsoft/mscclpp.git
$ mkdir -p mscclpp/build && cd mscclpp/build
+```
+
+For NVIDIA platforms, build MSCCL++ as follows.
+
+```bash
+# For NVIDIA platforms
$ cmake -DCMAKE_BUILD_TYPE=Release ..
$ make -j
```

+For AMD platforms, use HIPCC instead of the default C++ compiler. Replace `/path/to/hipcc` in the command below with your HIPCC path.
+
+```bash
+# For AMD platforms
+$ CXX=/path/to/hipcc cmake -DCMAKE_BUILD_TYPE=Release ..
+$ make -j
+```
+
## Install from Source (Libraries and Headers)

```bash
@@ -54,6 +70,8 @@ Our base image installs all prerequisites for MSCCL++.

$ docker pull ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1
```

+See all available images [here](https://github.com/microsoft/mscclpp/pkgs/container/mscclpp%2Fmscclpp).
+
## Unit Tests

`unit_tests` require one GPU on the system. It only tests operation of basic components.

@@ -76,37 +94,53 @@ To run `mp_unit_tests` with more than two nodes, you need to specify the `-ip_port`

$ mpirun -np 16 -npernode 8 -hostfile hostfile ./test/mp_unit_tests -ip_port 10.0.0.5:50000
```

-## mscclpp-test
+## Performance Benchmark
+
+### Python Benchmark

-mscclpp-test is a set of performance benchmarks for MSCCL++. It requires MPI to be installed on the system, and the path should be provided via `MPI_HOME` environment variable to the CMake build system.
+[Install the MSCCL++ Python package](#install-from-source-python-module) and run our Python AllReduce benchmark as follows. It requires MPI on the system.
+
+```bash
+# Choose either `requirements_cu11.txt` or `requirements_cu12.txt` according to your CUDA version.
+$ python3 -m pip install -r ./python/requirements_cu12.txt
+$ mpirun -tag-output -np 8 python3 ./python/benchmark/allreduce_bench.py
+```
+
+### C++ Benchmark (mscclpp-test)
+
+*NOTE: mscclpp-test will be retired soon and will be maintained only as an example of a C++ implementation. If you want to get the latest performance numbers, please use the Python benchmark instead.*
+
+mscclpp-test is a set of C++ performance benchmarks. It requires MPI on the system, and the path should be provided via the `MPI_HOME` environment variable to the CMake build system.

```bash
$ MPI_HOME=/path/to/mpi cmake -DCMAKE_BUILD_TYPE=Release ..
-$ make -j sendrecv_test_perf allgather_test_perf allreduce_test_perf alltoall_test_perf
+$ make -j allgather_test_perf allreduce_test_perf
```

-For example, the following command runs the AllReduce benchmark with 8 GPUs starting from 3MB to 48MB messages, by doubling the message size in between.
+For example, the following command runs the `allreduce5` algorithm with 8 GPUs on message sizes from 3MB to 48MB, doubling the message size in between. You can try different algorithms by changing the `-k 5` option to another value (e.g., `-k 3` runs `allreduce3`). Check all algorithms in the code: [allreduce_test.cu](https://github.com/microsoft/mscclpp/blob/main/test/mscclpp-test/allreduce_test.cu) and [allgather_test.cu](https://github.com/microsoft/mscclpp/blob/main/test/mscclpp-test/allgather_test.cu).

```bash
$ mpirun --bind-to numa -np 8 ./test/mscclpp-test/allreduce_test_perf -b 3m -e 48m -G 100 -n 100 -w 20 -f 2 -k 5
```

+*NOTE: a few algorithms set a condition on the total data size, such as requiring it to be a multiple of 3. If the condition is unmet, the command will report a corresponding error.*
+
Check the help message for more details.

```bash
$ ./test/mscclpp-test/allreduce_test_perf --help
USAGE: allreduce_test_perf
    [-b,--minbytes ]
    [-e,--maxbytes ]
    [-i,--stepbytes ]
    [-f,--stepfactor ]
    [-n,--iters ]
    [-w,--warmup_iters ]
    [-c,--check <0/1>]
    [-T,--timeout