[CUDA] Fix cuda 13 build (microsoft#26153)

tianleiwu · web-flow · commit 4de31529f8bd · 2025-09-25T15:51:12.000-07:00
Fix cuda 13 build errors and warnings. Related: microsoft#25936 I've verified the build in Linux and Windows using the following test settings: ### Build command line You may need change cuda_home and cudnn_home to your installation directories, also update CMAKE_CUDA_ARCHITECTURES according to your GPU. #### Linux Build ``` pip install cmake ninja packaging numpy sh build.sh --config Release --build_dir build/cuda13 --parallel --use_cuda \ --cuda_version 12.8 --cuda_home /nvida/cuda13.0/ \ --cudnn_home /nvida/cudnn9.12_cu13/ \ --build_wheel --skip_tests \ --cmake_generator Ninja \ --enable_cuda_nhwc_ops \ --use_binskim_compliant_compile_flags \ --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=90-real;90-virtual \ --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON \ --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON ``` #### Windows Build ``` IF "%VCToolsVersion%"=="" call "C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvars64.bat" build.bat --cmake_generator "Visual Studio 17 2022" --config Release --build_dir build\cuda13 --build_wheel ^ --parallel 4 --nvcc_threads 1 --build_shared_lib ^ --use_cuda --cuda_version "13.0" --cuda_home "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0" ^ --cudnn_home "D:\cudnn\9.13.0.50_cuda13" ^ --skip_tests ^ --use_binskim_compliant_compile_flags ^ --enable_cuda_nhwc_ops ^ --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=native" ^ --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON ^ --cmake_extra_defines FETCHCONTENT_TRY_FIND_PACKAGE_MODE=NEVER ``` The onnxruntime_test_all.exe is passed for RTX 5060 Ti GPU, so the binary can support blackwell GPU (CUDA_ARCHITECTURES=120) properly with CUDA 13.0: ``` [----------] Global test environment tear-down [==========] 1242 tests from 111 test suites ran. (83468 ms total) [ PASSED ] 1242 tests. ```
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
@@ -1192,7 +1192,7 @@ function(onnxruntime_configure_target target_name)
 
   # Keep BinSkim happy
   if(MSVC AND NOT onnxruntime_target_platform MATCHES "ARM")
-    target_link_options(${target_name} PRIVATE "/CETCOMPAT")
+    target_link_options(${target_name} PRIVATE "$<$<LINK_LANGUAGE:CXX,C>:/CETCOMPAT>" "$<$<LINK_LANGUAGE:CUDA>:-Xlinker=/CETCOMPAT>")
   endif()
 
 endfunction()
@@ -1421,7 +1421,6 @@ configure_file(onnxruntime_config.h.in ${CMAKE_CURRENT_BINARY_DIR}/onnxruntime_c
 get_property(onnxruntime_GENERATOR_IS_MULTI_CONFIG GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
 
 if (onnxruntime_USE_CUDA)
-  set(CMAKE_CUDA_RUNTIME_LIBRARY Shared)
   set(CMAKE_CUDA_STANDARD 17)
   if(onnxruntime_CUDA_HOME)
     file(TO_CMAKE_PATH CUDAToolkit_ROOT ${onnxruntime_CUDA_HOME})
@@ -1441,6 +1440,14 @@ if (onnxruntime_USE_CUDA)
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xfatbin=-compress-all")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --Werror default-stream-launch")
+
+  if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
+    if (UNIX)
+      # Suppress deprecation errors (e.g., long4 in CUDA 13)
+      add_compile_options(-Wno-deprecated-declarations)
+    endif()
+  endif()
+
   if (NOT WIN32)
     list(APPEND CUDA_NVCC_FLAGS --compiler-options -fPIC)
   endif()
diff --git a/cmake/external/cuda_configuration.cmake b/cmake/external/cuda_configuration.cmake
@@ -58,6 +58,19 @@ macro(setup_cuda_compiler)
   if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS CUDA_REQUIRED_VERSION)
     message(FATAL_ERROR "CUDA version ${CMAKE_CUDA_COMPILER_VERSION} must be at least ${CUDA_REQUIRED_VERSION}")
   endif()
+
+  # For CUDA 13+, explicitly set the compiler front-end to Clang to handle
+  # MSVC-specific pragmas correctly in device code.
+  if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0 AND NOT DEFINED CMAKE_CUDA_COMPILER_FRONTEND_VARIANT)
+    message(STATUS "Setting CUDA compiler front-end to Clang by default for CUDA 13+.")
+    set(CMAKE_CUDA_COMPILER_FRONTEND_VARIANT "CLANG")
+  endif()
+
+  if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
+    set(CMAKE_CUDA_RUNTIME_LIBRARY "Hybrid")
+  else()
+    set(CMAKE_CUDA_RUNTIME_LIBRARY "Shared")
+  endif()
 endmacro()
 
 macro(setup_cuda_architectures)
diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake
@@ -191,6 +191,17 @@
       target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=221>")
     endif()
 
+    if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
+      if (UNIX)
+        # Suppress -Wattributes warning from protobuf headers with nvcc on Linux
+        target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler -Wno-attributes>")
+      endif()
+
+      if (MSVC)
+          target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=20199>")
+      endif()
+    endif()
+
     if (UNIX)
       target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler -Wno-reorder>"
                   "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:-Wno-reorder>")
diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake
@@ -1523,7 +1523,7 @@ endif()
       list(APPEND onnxruntime_shared_lib_test_LIBS cpuinfo)
     endif()
     if (onnxruntime_USE_CUDA)
-      list(APPEND onnxruntime_shared_lib_test_LIBS)
+      list(APPEND onnxruntime_shared_lib_test_LIBS CUDA::cudart)
     endif()
 
     if (onnxruntime_USE_TENSORRT)
@@ -1751,6 +1751,7 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
     if (HAS_QSPECTRE)
       list(APPEND custom_op_lib_option "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /Qspectre>")
     endif()
+    set(custom_op_lib_link ${custom_op_lib_link} CUDA::cudart)
   endif()
 
   file(GLOB custom_op_src ${custom_op_src_patterns})
diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_impl.h b/onnxruntime/contrib_ops/cuda/bert/attention_impl.h
@@ -22,6 +22,12 @@ namespace cuda {
 
 constexpr int kCumulatedSequenceLengthCacheMaxBatchSize = 128;
 
+// longlong4 is deprecated in cuda 13.
+// LongLong4 is similar to longlong4_32a, except this is also visible in Host compiler (longlong4_32a is only visible to nvcc);
+typedef struct __align__(32) {
+  long long int x, y, z, w;
+} LongLong4;
+
 // A cache for cumulated sequence length. It will be initialized in the first request, then become read-only after that.
 struct CumulatedSequenceLengthCache {
   onnxruntime::IAllocatorUniquePtr<void> buffer;
@@ -144,14 +150,14 @@ Status PastPresentBufferShare(int batch_size, int num_heads, int qk_head_size, i
 template <typename T>
 Status LaunchStridedCopy(
     cudaStream_t stream,
-    const T* in, int4 in_shape, longlong4 in_strides, const int* in_seqlens_offset,  // coord (b,n,s,h)
-    T* out, longlong4 out_strides, const int* out_seqlens_offset,                    // coord (b,n,s,h)
+    const T* in, int4 in_shape, LongLong4 in_strides, const int* in_seqlens_offset,  // coord (b,n,s,h)
+    T* out, LongLong4 out_strides, const int* out_seqlens_offset,                    // coord (b,n,s,h)
     int max_threads_per_block);
 
 template <typename T>
 Status LaunchStridedCopy(cudaStream_t stream,
-                         const T* in, int4 in_shape, longlong4 in_strides,  // coord (b,n,s,h)
-                         T* out, longlong4 out_strides,                     // coord (b,n,s,h)
+                         const T* in, int4 in_shape, LongLong4 in_strides,  // coord (b,n,s,h)
+                         T* out, LongLong4 out_strides,                     // coord (b,n,s,h)
                          int max_threads_per_block);
 
 }  // namespace cuda
diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_strided_copy.cu b/onnxruntime/contrib_ops/cuda/bert/attention_strided_copy.cu
@@ -11,8 +11,8 @@ namespace contrib {
 namespace cuda {
 
 template <typename T>
-__global__ void StridedCopy(const T* in, const int H, longlong4 in_strides,  // coord (b,n,s,h)
-                            T* out, longlong4 out_strides,                   // coord (b,n,s,h)
+__global__ void StridedCopy(const T* in, const int H, LongLong4 in_strides,  // coord (b,n,s,h)
+                            T* out, LongLong4 out_strides,                   // coord (b,n,s,h)
                             const int32_t* in_seqlens_offset, const int32_t* out_seqlens_offset) {
   const int h = threadIdx.x;
   const int n = threadIdx.y;
@@ -30,8 +30,8 @@ __global__ void StridedCopy(const T* in, const int H, longlong4 in_strides,  //
 }
 
 template <typename T>
-__global__ void StridedCopyLarge(const T* in, const int H, longlong4 in_strides,  // coord (b,n,s,h)
-                                 T* out, longlong4 out_strides,                   // coord (b,n,s,h)
+__global__ void StridedCopyLarge(const T* in, const int H, LongLong4 in_strides,  // coord (b,n,s,h)
+                                 T* out, LongLong4 out_strides,                   // coord (b,n,s,h)
                                  const int* in_seqlens_offset, const int* out_seqlens_offset) {
   // Use when (H*)*num_heads > 1024
   int h = threadIdx.x;
@@ -77,7 +77,7 @@ struct ToByteType<16> {
 
 template <>
 struct ToByteType<32> {
-  using T = ulonglong4;
+  using T = LongLong4;
 };
 
 template <int NumBytes>
@@ -86,8 +86,8 @@ using ToBytes = typename ToByteType<NumBytes>::T;
 template <typename T>
 Status LaunchStridedCopy(
     cudaStream_t stream,
-    const T* in, int4 in_shape, longlong4 in_strides, const int* in_seqlens_offset,  // coord (b,n,s,h)
-    T* out, longlong4 out_strides, const int* out_seqlens_offset,                    // coord (b,n,s,h)
+    const T* in, int4 in_shape, LongLong4 in_strides, const int* in_seqlens_offset,  // coord (b,n,s,h)
+    T* out, LongLong4 out_strides, const int* out_seqlens_offset,                    // coord (b,n,s,h)
     int max_threads_per_block) {
   int batch_size = in_shape.x;
   int num_heads = in_shape.y;
@@ -157,8 +157,8 @@ Status LaunchStridedCopy(
 
 template <typename T>
 Status LaunchStridedCopy(cudaStream_t stream,
-                         const T* in, int4 in_shape, longlong4 in_strides,  // coord (b,n,s,h)
-                         T* out, longlong4 out_strides,                     // coord (b,n,s,h)
+                         const T* in, int4 in_shape, LongLong4 in_strides,  // coord (b,n,s,h)
+                         T* out, LongLong4 out_strides,                     // coord (b,n,s,h)
                          int max_threads_per_block) {
   const int* in_seqlens_offset = nullptr;
   const int* out_seqlens_offset = nullptr;
@@ -170,14 +170,14 @@ Status LaunchStridedCopy(cudaStream_t stream,
 
 template Status LaunchStridedCopy<float>(
     cudaStream_t stream,
-    const float* in, int4 in_shape, longlong4 in_strides,
-    float* out, longlong4 out_strides,
+    const float* in, int4 in_shape, LongLong4 in_strides,
+    float* out, LongLong4 out_strides,
     int max_threads_per_block);
 
 template Status LaunchStridedCopy<half>(
     cudaStream_t stream,
-    const half* in, int4 in_shape, longlong4 in_strides,
-    half* out, longlong4 out_strides,
+    const half* in, int4 in_shape, LongLong4 in_strides,
+    half* out, LongLong4 out_strides,
     int max_threads_per_block);
 
 }  // namespace cuda
diff --git a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/kernel_forward.h b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/kernel_forward.h
@@ -31,12 +31,13 @@
 
 #pragma once
 
+#include "core/providers/cuda/curand_wrapper.h"
+
 #ifdef HAS_PYTORCH
 #include <ATen/cuda/CUDAGeneratorImpl.h>
 #include <ATen/cuda/CUDAGraphsUtils.cuh>
 #endif
 
-#include <curand_kernel.h>
 #include <cmath>
 #include <cinttypes>
 #include <vector>
diff --git a/onnxruntime/contrib_ops/cuda/llm/cutlass_extensions/gemm/collective/sm90_mma_interleaved_tma_gmma_rs_warpspecialized_mixed_input.hpp b/onnxruntime/contrib_ops/cuda/llm/cutlass_extensions/gemm/collective/sm90_mma_interleaved_tma_gmma_rs_warpspecialized_mixed_input.hpp
@@ -49,7 +49,6 @@
 #include "cute/atom/copy_traits_sm90_tma.hpp"
 #include "cute/atom/mma_atom.hpp"
 #include "cute/numeric/arithmetic_tuple.hpp"
-#include "cute/tensor_predicate.hpp"
 #include "contrib_ops/cuda/llm/cutlass_extensions/interleaved_numeric_conversion.h"
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h
@@ -4,8 +4,9 @@
 #pragma once
 
 #include <stdint.h>
+#include "core/providers/cuda/curand_wrapper.h"
 #include <cuda_fp16.h>
-#include <curand_kernel.h>
+
 #include <cstdio>
 #include "contrib_ops/cpu/transformers/generation_shared.h"
 
diff --git a/onnxruntime/contrib_ops/rocm/bert/attention_impl.h b/onnxruntime/contrib_ops/rocm/bert/attention_impl.h
@@ -14,6 +14,10 @@ namespace onnxruntime {
 namespace contrib {
 namespace rocm {
 
+typedef struct __align__(32) {
+  long long int x, y, z, w;
+} LongLong4;
+
 size_t GetAttentionScratchSize(
     size_t element_size,
     int batch_size,
@@ -162,14 +166,14 @@ Status ClassifyAttentionMode(AttentionType type,
 template <typename T>
 Status LaunchStridedCopy(
     hipStream_t stream,
-    const T* in, int4 in_shape, longlong4 in_strides, const int* in_seqlens_offset,  // coord (b,n,s,h)
-    T* out, longlong4 out_strides, const int* out_seqlens_offset,                    // coord (b,n,s,h)
+    const T* in, int4 in_shape, LongLong4 in_strides, const int* in_seqlens_offset,  // coord (b,n,s,h)
+    T* out, LongLong4 out_strides, const int* out_seqlens_offset,                    // coord (b,n,s,h)
     int max_threads_per_block);
 
 template <typename T>
 Status LaunchStridedCopy(hipStream_t stream,
-                         const T* in, int4 in_shape, longlong4 in_strides,  // coord (b,n,s,h)
-                         T* out, longlong4 out_strides,                     // coord (b,n,s,h)
+                         const T* in, int4 in_shape, LongLong4 in_strides,  // coord (b,n,s,h)
+                         T* out, LongLong4 out_strides,                     // coord (b,n,s,h)
                          int max_threads_per_block);
 }  // namespace rocm
 }  // namespace contrib
diff --git a/onnxruntime/contrib_ops/rocm/bert/batched_gemm_softmax_gemm_permute_pipelines.cuh b/onnxruntime/contrib_ops/rocm/bert/batched_gemm_softmax_gemm_permute_pipelines.cuh
@@ -143,7 +143,7 @@ struct Strides {
                             int seqlen_dim,
                             int head_size_dim) {
     ORT_UNUSED_PARAMETER(batch_dim);
-    return Strides{longlong4{
+    return Strides{LongLong4{
         static_cast<int64_t>(num_head_dim) * seqlen_dim * head_size_dim,
         static_cast<int64_t>(seqlen_dim) * head_size_dim,
         static_cast<int64_t>(head_size_dim),
@@ -157,15 +157,15 @@ struct Strides {
                             int num_head_dim,
                             int head_size_dim) {
     ORT_UNUSED_PARAMETER(batch_dim);
-    return Strides{longlong4{
+    return Strides{LongLong4{
         static_cast<int64_t>(seqlen_dim) * num_head_dim * head_size_dim,
         static_cast<int64_t>(head_size_dim),
         static_cast<int64_t>(num_head_dim) * head_size_dim,
         static_cast<int64_t>(1),
     }};
   }
 
-  template <typename T = longlong4>
+  template <typename T = LongLong4>
   T ForBNSHCoord() const {
     using E = typename T::value_type;
     return T{static_cast<E>(strides_for_bnsh_coord.x),
@@ -174,7 +174,7 @@ struct Strides {
              static_cast<E>(strides_for_bnsh_coord.w)};
   }
 
-  template <typename T = longlong4>
+  template <typename T = LongLong4>
   T ForBSNHCoord() const {
     using E = typename T::value_type;
     return T{static_cast<E>(strides_for_bnsh_coord.x),
@@ -183,7 +183,7 @@ struct Strides {
              static_cast<E>(strides_for_bnsh_coord.w)};
   }
 
-  template <typename T = longlong4>
+  template <typename T = LongLong4>
   T ForBNHSCoord() const {
     using E = typename T::value_type;
     return T{static_cast<E>(strides_for_bnsh_coord.x),
@@ -198,7 +198,7 @@ struct Strides {
   }
 
   // store intermediate strides in the canonical (b,n,s,h) coordinate order
-  longlong4 strides_for_bnsh_coord;
+  LongLong4 strides_for_bnsh_coord;
 };
 
 template <typename HipT, typename T>
diff --git a/onnxruntime/core/providers/cpu/ml/tree_ensemble_aggregator.h b/onnxruntime/core/providers/cpu/ml/tree_ensemble_aggregator.h
@@ -178,7 +178,7 @@ struct TreeNodeElement {
 
   inline NODE_MODE_ORT mode() const { return NODE_MODE_ORT(flags & 0x1F); }
   inline bool is_not_leaf() const { return !(flags & NODE_MODE_ORT::LEAF); }
-  inline bool is_missing_track_true() const { return flags & MissingTrack::kTrue; }
+  inline bool is_missing_track_true() const { return static_cast<uint8_t>(flags) & static_cast<uint8_t>(MissingTrack::kTrue); }
 
 #if defined(_TREE_DEBUG)
   std::string str() const {
diff --git a/onnxruntime/core/providers/cuda/curand_wrapper.h b/onnxruntime/core/providers/cuda/curand_wrapper.h
@@ -0,0 +1,12 @@
+//
+// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
+//
+
+#pragma once
+
+#if defined(CUDA_VERSION) && CUDA_VERSION == 13000
+#define __NV_NO_VECTOR_DEPRECATION_DIAG 1
+#endif
+
+#include <curand_kernel.h>
diff --git a/onnxruntime/core/providers/cuda/fpgeneric.cu b/onnxruntime/core/providers/cuda/fpgeneric.cu
@@ -10,9 +10,8 @@
 */
 // NV_TODO: optimize speed -- pass things needed in, optimize kernel speed, add half2
 // NV_TODO: investigate cub support for half
-
+#include "core/providers/cuda/curand_wrapper.h"
 #include "core/providers/cuda/cu_inc/common.cuh"
-#include <curand_kernel.h>
 
 #define TRANS_TILE_DIM 32
 #define BLOCK_ROWS 8
diff --git a/onnxruntime/core/providers/cuda/generator/random_impl.cu b/onnxruntime/core/providers/cuda/generator/random_impl.cu
@@ -1,9 +1,9 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+#include "core/providers/cuda/curand_wrapper.h"
 #include "core/providers/cuda/generator/random_impl.h"
 
-#include <curand_kernel.h>
 #include <algorithm>
 #include "core/providers/cuda/cu_inc/common.cuh"
 
diff --git a/onnxruntime/core/providers/cuda/nn/dropout_impl.cu b/onnxruntime/core/providers/cuda/nn/dropout_impl.cu
@@ -15,10 +15,10 @@
  */
 
 /* Modifications Copyright (c) Microsoft. */
+#include "core/providers/cuda/curand_wrapper.h"
 
 #include "core/providers/cuda/nn/dropout_impl.h"
 
-#include <curand_kernel.h>
 #include <algorithm>
 #include "core/providers/cuda/cu_inc/bitmask.cuh"
 
diff --git a/orttraining/orttraining/training_ops/cuda/math/bias_softmax_dropout_impl.cu b/orttraining/orttraining/training_ops/cuda/math/bias_softmax_dropout_impl.cu
@@ -1,9 +1,8 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
-
+#include "core/providers/cuda/curand_wrapper.h"
 #include "orttraining/training_ops/cuda/math/bias_softmax_dropout_impl.h"
 
-#include <curand_kernel.h>
 #include <algorithm>
 #include "core/providers/cuda/cuda_common.h"
 #include "core/providers/cuda/math/softmax_warpwise_impl.cuh"