diff --git a/cmake/modules/RocmSetup.cmake b/cmake/modules/RocmSetup.cmake index 13404c0188..d86ee32271 100644 --- a/cmake/modules/RocmSetup.cmake +++ b/cmake/modules/RocmSetup.cmake @@ -27,6 +27,24 @@ if(FBGEMM_BUILD_VARIANT STREQUAL BUILD_VARIANT_ROCM) -Wno-ignored-attributes -Wno-unused-result) + # is this hipify v2? + execute_process( + COMMAND "${Python_EXECUTABLE}" -c + "from torch.utils.hipify import __version__; print(__version__)" + WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" + OUTPUT_VARIABLE _tempvar + RESULT_VARIABLE _resvar + ERROR_VARIABLE _errvar) + if(NOT "${_resvar}" EQUAL "0") + message(WARNING "Failed to execute Python (${Python_EXECUTABLE})\n" + "Result: ${_resvar}\n" + "Error: ${_errvar}\n") + endif() + string(STRIP "${_tempvar}" _tempvar) + if(_tempvar VERSION_GREATER_EQUAL "2") + list(APPEND HIP_HCC_FLAGS -DHIPIFY_V2) + endif() + BLOCK_PRINT( "HIP found: ${HIP_FOUND}" "HIPCC compiler flags:" diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index d67deed27d..217de3f68e 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -200,7 +200,8 @@ if(FBGEMM_BUILD_VARIANT STREQUAL BUILD_VARIANT_ROCM) ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/experimental/example - ${CMAKE_CURRENT_SOURCE_DIR}/experimental/gen_ai) + ${CMAKE_CURRENT_SOURCE_DIR}/experimental/gen_ai + ${CMAKE_CURRENT_SOURCE_DIR}/experimental/gen_ai/src/quantize/common/include/fbgemm_gpu/quantize) # HIPify all .CU and .CUH sources under the current directory (`/fbgemm_gpu`) # diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/bf16_grouped/bf16_grouped_gemm.hip b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/bf16_grouped/bf16_grouped_gemm.hip index 7c0ae8c0ef..65c5160928 100644 --- a/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/bf16_grouped/bf16_grouped_gemm.hip +++ b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/bf16_grouped/bf16_grouped_gemm.hip @@ -23,6 
+23,10 @@ #include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp" #include "kernels/bf16_grouped_kernel_manifest.h" +#ifdef HIPIFY_V2 +#define getCurrentHIPStream getCurrentCUDAStream +#endif + namespace fbgemm_gpu { // Define useful types that are needed for various kernels. diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/bf16_grouped/kernels/bf16_grouped_common.h b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/bf16_grouped/kernels/bf16_grouped_common.h index 8e5b946f41..e314ca98f4 100644 --- a/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/bf16_grouped/kernels/bf16_grouped_common.h +++ b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/bf16_grouped/kernels/bf16_grouped_common.h @@ -7,10 +7,10 @@ */ #include -#ifdef USE_ROCM #include -#else -#include + +#ifdef HIPIFY_V2 +#define getCurrentHIPStream getCurrentCUDAStream #endif #include "ck/ck.hpp" diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/ck_utility.hip b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/ck_utility.hip index 5cd718e8cc..b25cff544e 100644 --- a/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/ck_utility.hip +++ b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/ck_utility.hip @@ -17,6 +17,10 @@ #include #include +#ifdef HIPIFY_V2 +#define getCurrentHIPStream getCurrentCUDAStream +#endif + #if defined(USE_ROCM) #include "ck/ck.hpp" diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_blockwise_gemm.hip b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_blockwise_gemm.hip index 811461df1c..a930195cdf 100644 --- a/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_blockwise_gemm.hip +++ b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_blockwise_gemm.hip @@ -14,6 +14,10 @@ #include #include +#ifdef HIPIFY_V2 +#define getCurrentHIPStream getCurrentCUDAStream +#endif + #include "ck/ck.hpp" 
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_rowwise/kernels/fp8_rowwise_common.h b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_rowwise/kernels/fp8_rowwise_common.h index 71b0c1637f..2fc64f7bb4 100644 --- a/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_rowwise/kernels/fp8_rowwise_common.h +++ b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_rowwise/kernels/fp8_rowwise_common.h @@ -9,10 +9,10 @@ #include #include -#ifdef USE_ROCM #include -#else -#include + +#ifdef HIPIFY_V2 +#define getCurrentHIPStream getCurrentCUDAStream #endif #include "ck/ck.hpp" diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_rowwise_batched/kernels/fp8_rowwise_batched_common.h b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_rowwise_batched/kernels/fp8_rowwise_batched_common.h index c0ecb364e6..eca720ebd0 100644 --- a/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_rowwise_batched/kernels/fp8_rowwise_batched_common.h +++ b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_rowwise_batched/kernels/fp8_rowwise_batched_common.h @@ -7,10 +7,10 @@ */ #include -#ifdef USE_ROCM #include -#else -#include + +#ifdef HIPIFY_V2 +#define getCurrentHIPStream getCurrentCUDAStream #endif #include "ck/ck.hpp" diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip index 646692949a..bf80c00337 100644 --- a/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip +++ b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip @@ -14,6 +14,10 @@ #include #include +#ifdef 
HIPIFY_V2 +#define getCurrentHIPStream getCurrentCUDAStream +#endif + #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp" #include "kernels/fp8_rowwise_grouped_kernel_manifest.h" diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped_common.h b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped_common.h index 3f9c1215da..2f6a6ac3f1 100644 --- a/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped_common.h +++ b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped_common.h @@ -7,10 +7,10 @@ */ #undef __HIP_NO_HALF_CONVERSIONS__ #include -#ifdef USE_ROCM #include -#else -#include + +#ifdef HIPIFY_V2 +#define getCurrentHIPStream getCurrentCUDAStream #endif #include "ck/ck.hpp" diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_rowwise_preshuffle/kernels/fp8_rowwise_preshuffle_common.h b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_rowwise_preshuffle/kernels/fp8_rowwise_preshuffle_common.h index 75ea2a1045..3784db7e57 100644 --- a/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_rowwise_preshuffle/kernels/fp8_rowwise_preshuffle_common.h +++ b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_rowwise_preshuffle/kernels/fp8_rowwise_preshuffle_common.h @@ -9,10 +9,10 @@ #include #include -#ifdef USE_ROCM #include -#else -#include + +#ifdef HIPIFY_V2 +#define getCurrentHIPStream getCurrentCUDAStream #endif #include "ck/ck.hpp" diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_tensorwise_gemm.hip b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_tensorwise_gemm.hip index 3fe55a705b..b53e0df2aa 100644 --- 
a/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_tensorwise_gemm.hip +++ b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_tensorwise_gemm.hip @@ -14,6 +14,10 @@ #include #include +#ifdef HIPIFY_V2 +#define getCurrentHIPStream getCurrentCUDAStream +#endif + #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fused_moe/fused_moe_kernel.hip b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fused_moe/fused_moe_kernel.hip index 55356e7256..07a6cb42d5 100644 --- a/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fused_moe/fused_moe_kernel.hip +++ b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fused_moe/fused_moe_kernel.hip @@ -7,6 +7,10 @@ #include +#ifdef HIPIFY_V2 +#define getCurrentHIPStream getCurrentCUDAStream +#endif + #include #include #include diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/common/include/fbgemm_gpu/quantize/tuning_cache.hpp b/fbgemm_gpu/experimental/gen_ai/src/quantize/common/include/fbgemm_gpu/quantize/tuning_cache.hpp index d6f4938785..2b7012d14f 100644 --- a/fbgemm_gpu/experimental/gen_ai/src/quantize/common/include/fbgemm_gpu/quantize/tuning_cache.hpp +++ b/fbgemm_gpu/experimental/gen_ai/src/quantize/common/include/fbgemm_gpu/quantize/tuning_cache.hpp @@ -14,26 +14,9 @@ #include -// In OSS hipification of the include is not working, so we hipify it manually. 
-#ifdef USE_ROCM -#include // @manual -#include // @manual -#include -#define GPUStream at::hip::HIPStreamMasqueradingAsCUDA -#define GPUStreamGuard at::hip::HIPStreamGuardMasqueradingAsCUDA -#define getStreamFromPool at::hip::getStreamFromPoolMasqueradingAsCUDA -#define gpuStreamCaptureModeRelaxed hipStreamCaptureModeRelaxed -#define gpuEventDefault hipEventDefault -#else #include #include #include -#define GPUStream at::cuda::CUDAStream -#define GPUStreamGuard at::cuda::CUDAStreamGuard -#define getStreamFromPool at::cuda::getStreamFromPool -#define gpuStreamCaptureModeRelaxed cudaStreamCaptureModeRelaxed -#define gpuEventDefault cudaEventDefault -#endif #include @@ -232,8 +215,8 @@ class TuningCache final { at::cuda::CUDAGraph graph; { // CUDAGraph capture must happen on non-default stream - GPUStream stream = getStreamFromPool(true); - GPUStreamGuard streamGuard(stream); + at::cuda::CUDAStream stream = at::cuda::getStreamFromPool(true); + at::cuda::CUDAStreamGuard streamGuard(stream); // For flexibility, we use cudaStreamCaptureModeRelaxed. // - cudaStreamCaptureModeGlobal prevents other threads from calling @@ -242,7 +225,7 @@ class TuningCache final { // - cudaStreamCaptureModeThreadLocal prevents CCA from freeing memory. // Since CUDA graph is preferred for offline benchmark this should be // fine. - graph.capture_begin({0, 0}, gpuStreamCaptureModeRelaxed); + graph.capture_begin({0, 0}, cudaStreamCaptureModeRelaxed); for (int i = 0; i < num_iters; ++i) { kernel(std::forward(args)...); } @@ -296,8 +279,8 @@ class TuningCache final { constexpr static std::string_view FBGEMM_CACHE_DIR = ".fbgemm"; - at::cuda::CUDAEvent start_ = at::cuda::CUDAEvent(gpuEventDefault); - at::cuda::CUDAEvent stop_ = at::cuda::CUDAEvent(gpuEventDefault); + at::cuda::CUDAEvent start_ = at::cuda::CUDAEvent(cudaEventDefault); + at::cuda::CUDAEvent stop_ = at::cuda::CUDAEvent(cudaEventDefault); // If FBGEMM_AUTOTUNE_USE_CUDA_GRAPH is set, use CUDA graph for benchmarking. 
// CUDA graphs use a separate memory pool to do allocation in PyTorch