[ROCm] add support for ROCm/HIP device #6086

Open · wants to merge 24 commits into base: master
3 changes: 3 additions & 0 deletions .gitignore
@@ -463,3 +463,6 @@ dask-worker-space/
*.pub
*.rdp
*_rsa

# hipify-perl -inplace leaves behind *.prehip files
*.prehip
80 changes: 53 additions & 27 deletions CMakeLists.txt
@@ -4,6 +4,7 @@ option(USE_GPU "Enable GPU-accelerated training" OFF)
option(USE_SWIG "Enable SWIG to generate Java API" OFF)
option(USE_TIMETAG "Set to ON to output time costs" OFF)
option(USE_CUDA "Enable CUDA-accelerated training " OFF)
option(USE_ROCM "Enable ROCM-accelerated training " OFF)
Collaborator
Suggested change
option(USE_ROCM "Enable ROCM-accelerated training " OFF)
option(USE_ROCM "Enable ROCm-accelerated training " OFF)

option(USE_DEBUG "Set to ON for Debug mode" OFF)
option(USE_SANITIZER "Use sanitizer flags" OFF)
set(
@@ -160,6 +161,11 @@ if(USE_CUDA)
set(USE_OPENMP ON CACHE BOOL "CUDA requires OpenMP" FORCE)
endif()

if(USE_ROCM)
enable_language(HIP)
set(USE_OPENMP ON CACHE BOOL "ROCM requires OpenMP" FORCE)
Collaborator
Suggested change
set(USE_OPENMP ON CACHE BOOL "ROCM requires OpenMP" FORCE)
set(USE_OPENMP ON CACHE BOOL "ROCm requires OpenMP" FORCE)

endif()

if(USE_OPENMP)
if(APPLE)
find_package(OpenMP)
@@ -271,35 +277,53 @@ if(USE_CUDA)

message(STATUS "ALLFEATS_DEFINES: ${ALLFEATS_DEFINES}")
message(STATUS "FULLDATA_DEFINES: ${FULLDATA_DEFINES}")
endif()

function(add_histogram hsize hname hadd hconst hdir)
add_library(histo${hsize}${hname} OBJECT src/treelearner/kernels/histogram${hsize}.cu)
set_target_properties(
histo${hsize}${hname}
PROPERTIES
CUDA_SEPARABLE_COMPILATION ON
CUDA_ARCHITECTURES ${CUDA_ARCHS}
)
if(hadd)
list(APPEND histograms histo${hsize}${hname})
set(histograms ${histograms} PARENT_SCOPE)
endif()
target_compile_definitions(
histo${hsize}${hname}
PRIVATE
-DCONST_HESSIAN=${hconst}
${hdir}
)
endfunction()
if(USE_ROCM)
find_package(HIP)
include_directories(${HIP_INCLUDE_DIRS})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_AMD__")
Collaborator @StrikerRUS commented Dec 24, 2024:

Should we also set HIP_ARCHITECTURES?

For NVIDIA, are they reused from CUDA_ARCHITECTURES?

set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} ${OpenMP_CXX_FLAGS} -fPIC -Wall")

# avoid warning: unused variable 'mask' due to __shfl_down_sync work-around
set(DISABLED_WARNINGS "${DISABLED_WARNINGS} -Wno-unused-variable")
# avoid warning: 'hipHostAlloc' is deprecated: use hipHostMalloc instead
set(DISABLED_WARNINGS "${DISABLED_WARNINGS} -Wno-deprecated-declarations")
# avoid many warnings about missing overrides
set(DISABLED_WARNINGS "${DISABLED_WARNINGS} -Wno-inconsistent-missing-override")
# avoid warning: shift count >= width of type in feature_histogram.hpp
set(DISABLED_WARNINGS "${DISABLED_WARNINGS} -Wno-shift-count-overflow")

foreach(hsize _16_64_256)
add_histogram("${hsize}" "_sp_const" "True" "1" "${BASE_DEFINES}")
add_histogram("${hsize}" "_sp" "True" "0" "${BASE_DEFINES}")
add_histogram("${hsize}" "-allfeats_sp_const" "False" "1" "${ALLFEATS_DEFINES}")
add_histogram("${hsize}" "-allfeats_sp" "False" "0" "${ALLFEATS_DEFINES}")
add_histogram("${hsize}" "-fulldata_sp_const" "True" "1" "${FULLDATA_DEFINES}")
add_histogram("${hsize}" "-fulldata_sp" "True" "0" "${FULLDATA_DEFINES}")
endforeach()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${DISABLED_WARNINGS}")
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} ${DISABLED_WARNINGS}")

if(USE_DEBUG)
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -g -O0")
else()
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -O3")
endif()
message(STATUS "CMAKE_HIP_FLAGS: ${CMAKE_HIP_FLAGS}")

add_definitions(-DUSE_CUDA)

set(
Collaborator
Not used. See #6766 (review).

BASE_DEFINES
-DPOWER_FEATURE_WORKGROUPS=12
-DUSE_CONSTANT_BUF=0
)
set(
ALLFEATS_DEFINES
${BASE_DEFINES}
-DENABLE_ALL_FEATURES
)
set(
FULLDATA_DEFINES
${ALLFEATS_DEFINES}
-DIGNORE_INDICES
)

message(STATUS "ALLFEATS_DEFINES: ${ALLFEATS_DEFINES}")
message(STATUS "FULLDATA_DEFINES: ${FULLDATA_DEFINES}")
endif()

include(CheckCXXSourceCompiles)
@@ -634,7 +658,9 @@ if(USE_CUDA)
CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
endif()
endif()

if(USE_ROCM OR USE_CUDA)
Collaborator
Not used. See #6766 (review).

# histograms are list of object libraries. Linking object library to other
# object libraries only gets usage requirements, the linked objects won't be
# used. Thus we have to call target_link_libraries on final targets here.
50 changes: 46 additions & 4 deletions include/LightGBM/cuda/cuda_algorithms.hpp
@@ -1,6 +1,7 @@
/*!
* Copyright (c) 2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
* Modifications Copyright(C) 2023 Advanced Micro Devices, Inc. All rights reserved.
*/

#ifndef LIGHTGBM_CUDA_CUDA_ALGORITHMS_HPP_
@@ -14,6 +15,7 @@

#include <LightGBM/bin.h>
#include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/cuda/cuda_rocm_interop.h>
#include <LightGBM/utils/log.h>

#include <algorithm>
@@ -174,7 +176,7 @@ __device__ __forceinline__ void GlobalMemoryPrefixSum(T* array, const size_t len
for (size_t index = start; index < end; ++index) {
thread_sum += array[index];
}
__shared__ T shared_mem[32];
__shared__ T shared_mem[WARPSIZE];
const T thread_base = ShufflePrefixSumExclusive<T>(thread_sum, shared_mem);
if (start < end) {
array[start] += thread_base;
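
Context for the WARPSIZE change above: the shuffle-based scan parks one partial sum per warp in shared memory, and its second stage has every lane of warp 0 index that buffer by lane id, so the buffer must span a full warp: 32 entries on NVIDIA, 64 on warpSize-64 AMD architectures. Below is a minimal sketch of the pattern, assuming hypothetical helper names and a block size that is a multiple of the warp size (this is not the PR's ShufflePrefixSumExclusive):

template <typename T>
__device__ T WarpPrefixSumInclusive(T val) {
  const int lane = static_cast<int>(threadIdx.x) % WARPSIZE;
  // Kogge-Stone inclusive scan within one warp; on ROCm the _sync
  // suffix is macro-mapped to the unmasked __shfl_up.
  for (int offset = 1; offset < WARPSIZE; offset <<= 1) {
    const T other = __shfl_up_sync(0xffffffffu, val, offset);
    if (lane >= offset) val += other;
  }
  return val;
}

template <typename T>
__device__ T BlockPrefixSumInclusive(T val) {
  __shared__ T warp_sums[WARPSIZE];  // one slot per possible lane of warp 0
  const int lane = static_cast<int>(threadIdx.x) % WARPSIZE;
  const int warp_id = static_cast<int>(threadIdx.x) / WARPSIZE;
  val = WarpPrefixSumInclusive(val);
  if (lane == WARPSIZE - 1) warp_sums[warp_id] = val;  // stash each warp's total
  __syncthreads();
  if (warp_id == 0) {
    const int num_warps = static_cast<int>(blockDim.x) / WARPSIZE;
    T w = (lane < num_warps) ? warp_sums[lane] : T(0);
    warp_sums[lane] = WarpPrefixSumInclusive(w);  // every lane writes: needs WARPSIZE slots
  }
  __syncthreads();
  if (warp_id > 0) val += warp_sums[warp_id - 1];  // add totals of preceding warps
  return val;
}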
@@ -483,7 +485,7 @@ __device__ void ShuffleSortedPrefixSumDevice(const VAL_T* in_values,
const INDEX_T* sorted_indices,
REDUCE_VAL_T* out_values,
const INDEX_T num_data) {
__shared__ REDUCE_VAL_T shared_buffer[32];
__shared__ REDUCE_VAL_T shared_buffer[WARPSIZE];
const INDEX_T num_data_per_thread = (num_data + static_cast<INDEX_T>(blockDim.x) - 1) / static_cast<INDEX_T>(blockDim.x);
const INDEX_T start = num_data_per_thread * static_cast<INDEX_T>(threadIdx.x);
const INDEX_T end = min(start + num_data_per_thread, num_data);
@@ -572,8 +574,48 @@ __device__ VAL_T PercentileDevice(const VAL_T* values,
INDEX_T* indices,
REDUCE_WEIGHT_T* weights_prefix_sum,
const double alpha,
const INDEX_T len);

const INDEX_T len) {
if (len <= 1) {
return values[0];
}
if (!USE_WEIGHT) {
BitonicArgSortDevice<VAL_T, INDEX_T, ASCENDING, BITONIC_SORT_NUM_ELEMENTS / 2, 10>(values, indices, len);
const double float_pos = (1.0f - alpha) * len;
const INDEX_T pos = static_cast<INDEX_T>(float_pos);
if (pos < 1) {
return values[indices[0]];
} else if (pos >= len) {
return values[indices[len - 1]];
} else {
const double bias = float_pos - pos;
const VAL_T v1 = values[indices[pos - 1]];
const VAL_T v2 = values[indices[pos]];
return static_cast<VAL_T>(v1 - (v1 - v2) * bias);
}
} else {
BitonicArgSortDevice<VAL_T, INDEX_T, ASCENDING, BITONIC_SORT_NUM_ELEMENTS / 4, 9>(values, indices, len);
ShuffleSortedPrefixSumDevice<WEIGHT_T, REDUCE_WEIGHT_T, INDEX_T>(weights, indices, weights_prefix_sum, len);
const REDUCE_WEIGHT_T threshold = weights_prefix_sum[len - 1] * (1.0f - alpha);
__shared__ INDEX_T pos;
if (threadIdx.x == 0) {
pos = len;
}
__syncthreads();
for (INDEX_T index = static_cast<INDEX_T>(threadIdx.x); index < len; index += static_cast<INDEX_T>(blockDim.x)) {
if (weights_prefix_sum[index] > threshold && (index == 0 || weights_prefix_sum[index - 1] <= threshold)) {
pos = index;
}
}
__syncthreads();
pos = min(pos, len - 1);
if (pos == 0 || pos == len - 1) {
return values[pos];
}
const VAL_T v1 = values[indices[pos - 1]];
const VAL_T v2 = values[indices[pos]];
return static_cast<VAL_T>(v1 - (v1 - v2) * (threshold - weights_prefix_sum[pos - 1]) / (weights_prefix_sum[pos] - weights_prefix_sum[pos - 1]));
}
}
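
To make the unweighted branch above easier to follow: after the argsort, the target position float_pos = (1 - alpha) * len generally falls between the sorted neighbors at pos - 1 and pos, and the result blends them by the fractional part bias. A host-side C++ reference of that branch (illustrative only, with a hypothetical name; it assumes a non-empty input, as the device code does):

#include <algorithm>
#include <vector>

// Mirrors PercentileDevice's unweighted branch on the host.
double PercentileHostRef(std::vector<double> values, const double alpha) {
  if (values.size() <= 1) return values[0];
  std::sort(values.begin(), values.end());  // device code argsorts via BitonicArgSortDevice
  const double float_pos = (1.0 - alpha) * static_cast<double>(values.size());
  const size_t pos = static_cast<size_t>(float_pos);
  if (pos < 1) return values.front();
  if (pos >= values.size()) return values.back();
  const double bias = float_pos - static_cast<double>(pos);  // fractional part
  const double v1 = values[pos - 1];
  const double v2 = values[pos];
  return v1 - (v1 - v2) * bias;  // same linear blend as the device code
}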

} // namespace LightGBM

20 changes: 20 additions & 0 deletions include/LightGBM/cuda/cuda_rocm_interop.h
@@ -0,0 +1,20 @@
/*!
* Copyright(C) 2023 Advanced Micro Devices, Inc. All rights reserved.
*/
#ifdef USE_CUDA

#if defined(__HIP_PLATFORM_AMD__) || defined(__HIP__)
// ROCm doesn't have __shfl_down_sync, only __shfl_down without mask.
// Since mask is full 0xffffffff, we can use __shfl_down instead.
#define __shfl_down_sync(mask, val, offset) __shfl_down(val, offset)
#define __shfl_up_sync(mask, val, offset) __shfl_up(val, offset)
// ROCm warpSize is constexpr and is either 32 or 64 depending on gfx arch.
#define WARPSIZE warpSize
Collaborator
Should WARPSIZE be also used here?
__shared__ score_t shared_mem_buffer[32];

// ROCm doesn't have atomicAdd_block, but it should be semantically the same as atomicAdd
#define atomicAdd_block atomicAdd
#else
// CUDA warpSize is not a constexpr, but always 32
#define WARPSIZE 32
#endif

#endif
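
What this shim buys in practice: warp-level code written against CUDA's sync-shuffle API compiles unchanged for ROCm, because the full 0xffffffff mask carries no information the unmasked __shfl_down needs, and loops bounded by WARPSIZE adapt to the 32- or 64-lane width. A minimal sketch with a hypothetical helper (not part of this PR):

#include <LightGBM/cuda/cuda_rocm_interop.h>

// Sums `val` across the calling warp; on ROCm the __shfl_down_sync
// below expands to __shfl_down through the macro in this header.
template <typename T>
__device__ __forceinline__ T WarpReduceSum(T val) {
  for (int offset = WARPSIZE / 2; offset > 0; offset >>= 1) {
    val += __shfl_down_sync(0xffffffffu, val, offset);
  }
  return val;  // lane 0 ends up holding the warp-wide total
}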
11 changes: 6 additions & 5 deletions include/LightGBM/cuda/cuda_split_info.hpp
@@ -2,6 +2,7 @@
* Copyright (c) 2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for
* license information.
* Modifications Copyright(C) 2023 Advanced Micro Devices, Inc. All rights reserved.
*/

#ifdef USE_CUDA
@@ -40,24 +41,24 @@ class CUDASplitInfo {
uint32_t* cat_threshold = nullptr;
int* cat_threshold_real = nullptr;

__device__ CUDASplitInfo() {
__host__ __device__ CUDASplitInfo() {
num_cat_threshold = 0;
cat_threshold = nullptr;
cat_threshold_real = nullptr;
}

__device__ ~CUDASplitInfo() {
__host__ __device__ ~CUDASplitInfo() {
if (num_cat_threshold > 0) {
if (cat_threshold != nullptr) {
cudaFree(cat_threshold);
CUDASUCCESS_OR_FATAL(cudaFree(cat_threshold));
}
if (cat_threshold_real != nullptr) {
cudaFree(cat_threshold_real);
CUDASUCCESS_OR_FATAL(cudaFree(cat_threshold_real));
}
}
}

__device__ CUDASplitInfo& operator=(const CUDASplitInfo& other) {
__host__ __device__ CUDASplitInfo& operator=(const CUDASplitInfo& other) {
is_valid = other.is_valid;
leaf_index = other.leaf_index;
gain = other.gain;
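
On the cudaFree changes above: wrapping the calls in CUDASUCCESS_OR_FATAL (LightGBM's existing check macro from cuda_utils.hu) means a failing free now aborts with a message instead of being silently discarded. A simplified host-side sketch of the pattern, under an assumed name rather than the real macro definition:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Hypothetical stand-in for CUDASUCCESS_OR_FATAL: surface any CUDA API
// failure with file/line context and stop, rather than ignoring it.
#define CHECK_CUDA_OK(call)                                            \
  do {                                                                 \
    const cudaError_t cuda_ret = (call);                               \
    if (cuda_ret != cudaSuccess) {                                     \
      std::fprintf(stderr, "CUDA error '%s' at %s:%d\n",               \
                   cudaGetErrorString(cuda_ret), __FILE__, __LINE__);  \
      std::abort();                                                    \
    }                                                                  \
  } while (0)

Usage would then be CHECK_CUDA_OK(cudaFree(ptr));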
3 changes: 2 additions & 1 deletion include/LightGBM/cuda/vector_cudahost.h
@@ -1,6 +1,7 @@
/*!
* Copyright (c) 2020 IBM Corporation, Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
* Modifications Copyright(C) 2023 Advanced Micro Devices, Inc. All rights reserved.
*/
#ifndef LIGHTGBM_CUDA_VECTOR_CUDAHOST_H_
#define LIGHTGBM_CUDA_VECTOR_CUDAHOST_H_
@@ -45,7 +46,7 @@ struct CHAllocator {
n = SIZE_ALIGNED(n);
#ifdef USE_CUDA
if (LGBM_config_::current_device == lgbm_device_cuda) {
cudaError_t ret = cudaHostAlloc(&ptr, n*sizeof(T), cudaHostAllocPortable);
cudaError_t ret = cudaHostAlloc(reinterpret_cast<void**>(&ptr), n*sizeof(T), cudaHostAllocPortable);
if (ret != cudaSuccess) {
Log::Warning("Defaulting to malloc in CHAllocator!!!");
ptr = reinterpret_cast<T*>(_mm_malloc(n*sizeof(T), 16));
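
For context, CHAllocator is a drop-in std::allocator replacement: when the active device is CUDA (or ROCm after hipify) it hands out pinned, portable host memory via cudaHostAlloc, which speeds host-to-device copies, and it falls back to 16-byte-aligned malloc otherwise or on failure. A hedged usage sketch (names illustrative):

#include <vector>
#include <LightGBM/cuda/vector_cudahost.h>

// Gradient buffer backed by pinned host memory when a CUDA/ROCm device
// is active, by aligned ordinary memory otherwise.
std::vector<float, CHAllocator<float>> host_gradients(1 << 20);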