diff --git a/libcudacxx/include/cuda/__numeric/add_overflow.h b/libcudacxx/include/cuda/__numeric/add_overflow.h index da392c5cc4d..af5b46f3889 100644 --- a/libcudacxx/include/cuda/__numeric/add_overflow.h +++ b/libcudacxx/include/cuda/__numeric/add_overflow.h @@ -44,9 +44,15 @@ #include -#if _CCCL_CHECK_BUILTIN(__builtin_add_overflow) || _CCCL_COMPILER(GCC) +#if _CCCL_CHECK_BUILTIN(builtin_add_overflow) || _CCCL_COMPILER(GCC) # define _CCCL_BUILTIN_ADD_OVERFLOW(...) __builtin_add_overflow(__VA_ARGS__) -#endif // _CCCL_CHECK_BUILTIN(__builtin_add_overflow) +#endif // _CCCL_CHECK_BUILTIN(builtin_add_overflow) + +// nvc++ doesn't support 128-bit integers and crashes when certain type combinations are used (nvbug 5730860), so let's +// just disable the builtin for now. +#if _CCCL_COMPILER(NVHPC) +# undef _CCCL_BUILTIN_ADD_OVERFLOW +#endif // _CCCL_COMPILER(NVHPC) _CCCL_BEGIN_NAMESPACE_CUDA @@ -197,14 +203,24 @@ _CCCL_REQUIRES((::cuda::std::is_void_v<_Result> || ::cuda::std::__cccl_is_intege [[nodiscard]] _CCCL_API constexpr overflow_result<_ActualResult> add_overflow(const _Lhs __lhs, const _Rhs __rhs) noexcept { -// (1) __builtin_add_overflow is not available in a constant expression with gcc + nvcc -// (2) __builtin_add_overflow generates suboptimal code with nvc++ and clang-cuda for device code -#if defined(_CCCL_BUILTIN_ADD_OVERFLOW) && _CCCL_HOST_COMPILATION() \ - && !(_CCCL_COMPILER(GCC) && _CCCL_CUDA_COMPILER(NVCC)) - overflow_result<_ActualResult> __result; - __result.overflow = _CCCL_BUILTIN_ADD_OVERFLOW(__lhs, __rhs, &__result.value); - return __result; -#else + // We want to use __builtin_add_overflow only in host code. When compiling CUDA source file, we cannot use it in + // constant expressions, because it doesn't work before nvcc 13.1 and is buggy in 13.1. When compiling C++ source + // file, we can use it all the time. +#if defined(_CCCL_BUILTIN_ADD_OVERFLOW) +# if _CCCL_CUDA_COMPILATION() + _CCCL_IF_NOT_CONSTEVAL_DEFAULT +# endif // _CCCL_CUDA_COMPILATION() + { + NV_IF_TARGET(NV_IS_HOST, ({ + overflow_result<_ActualResult> __result{}; + __result.overflow = _CCCL_BUILTIN_ADD_OVERFLOW(__lhs, __rhs, &__result.value); + return __result; + })) + } +#endif // _CCCL_BUILTIN_ADD_OVERFLOW + + // Host fallback + device implementation. +#if _CCCL_CUDA_COMPILATION() || !defined(_CCCL_BUILTIN_ADD_OVERFLOW) using ::cuda::std::__make_nbit_int_t; using ::cuda::std::__make_nbit_uint_t; using ::cuda::std::__num_bits_v; @@ -294,7 +310,7 @@ _CCCL_API constexpr overflow_result<_ActualResult> add_overflow(const _Lhs __lhs } return overflow_result<_ActualResult>{static_cast<_ActualResult>(__sum), false}; // because of opposite signs } -#endif // defined(_CCCL_BUILTIN_ADD_OVERFLOW) && !_CCCL_CUDA_COMPILER(NVCC) +#endif // _CCCL_CUDA_COMPILATION() || !defined(_CCCL_BUILTIN_ADD_OVERFLOW) } //! @brief Adds two numbers \p __lhs and \p __rhs with overflow detection diff --git a/libcudacxx/include/cuda/__numeric/sub_overflow.h b/libcudacxx/include/cuda/__numeric/sub_overflow.h index 1887cd71652..95fd158c302 100644 --- a/libcudacxx/include/cuda/__numeric/sub_overflow.h +++ b/libcudacxx/include/cuda/__numeric/sub_overflow.h @@ -42,9 +42,15 @@ #include -#if _CCCL_CHECK_BUILTIN(__builtin_sub_overflow) || _CCCL_COMPILER(GCC) +#if _CCCL_CHECK_BUILTIN(builtin_sub_overflow) || _CCCL_COMPILER(GCC) # define _CCCL_BUILTIN_SUB_OVERFLOW(...) __builtin_sub_overflow(__VA_ARGS__) -#endif // _CCCL_CHECK_BUILTIN(__builtin_sub_overflow) +#endif // _CCCL_CHECK_BUILTIN(builtin_sub_overflow) + +// nvc++ doesn't support 128-bit integers and crashes when certain type combinations are used (nvbug 5730860), so let's +// just disable the builtin for now. +#if _CCCL_COMPILER(NVHPC) +# undef _CCCL_BUILTIN_SUB_OVERFLOW +#endif // _CCCL_COMPILER(NVHPC) _CCCL_BEGIN_NAMESPACE_CUDA @@ -213,14 +219,24 @@ _CCCL_REQUIRES((::cuda::std::is_void_v<_Result> || ::cuda::std::__cccl_is_intege [[nodiscard]] _CCCL_API constexpr overflow_result<_ActualResult> sub_overflow(const _Lhs __lhs, const _Rhs __rhs) noexcept { -// (1) __builtin_sub_overflow is not available in a constant expression with gcc + nvcc -// (2) __builtin_sub_overflow generates suboptimal code with nvc++ and clang-cuda for device code -#if defined(_CCCL_BUILTIN_SUB_OVERFLOW) && _CCCL_HOST_COMPILATION() \ - && !(_CCCL_COMPILER(GCC) && _CCCL_CUDA_COMPILER(NVCC)) - overflow_result<_ActualResult> __result; - __result.overflow = _CCCL_BUILTIN_SUB_OVERFLOW(__lhs, __rhs, &__result.value); - return __result; -#else + // We want to use __builtin_sub_overflow only in host code. When compiling CUDA source file, we cannot use it in + // constant expressions, because it doesn't work before nvcc 13.1 and is buggy in 13.1. When compiling C++ source + // file, we can use it all the time. +#if defined(_CCCL_BUILTIN_SUB_OVERFLOW) +# if _CCCL_CUDA_COMPILATION() + _CCCL_IF_NOT_CONSTEVAL_DEFAULT +# endif // _CCCL_CUDA_COMPILATION() + { + NV_IF_TARGET(NV_IS_HOST, ({ + overflow_result<_ActualResult> __result{}; + __result.overflow = _CCCL_BUILTIN_SUB_OVERFLOW(__lhs, __rhs, &__result.value); + return __result; + })) + } +#endif // _CCCL_BUILTIN_SUB_OVERFLOW + + // Host fallback + device implementation. +#if _CCCL_CUDA_COMPILATION() || !defined(_CCCL_BUILTIN_SUB_OVERFLOW) using ::cuda::std::common_type_t; using ::cuda::std::is_signed_v; using ::cuda::std::is_unsigned_v; @@ -322,7 +338,7 @@ _CCCL_API constexpr overflow_result<_ActualResult> sub_overflow(const _Lhs __lhs return overflow_result<_ActualResult>{__sub_ret, __is_overflow}; } } -#endif // defined(_CCCL_BUILTIN_SUB_OVERFLOW) && !_CCCL_CUDA_COMPILER(NVCC) +#endif // _CCCL_CUDA_COMPILATION() || !defined(_CCCL_BUILTIN_SUB_OVERFLOW) } //! @brief Subtracts two numbers \p __lhs and \p __rhs with overflow detection diff --git a/libcudacxx/include/cuda/std/__cccl/builtin.h b/libcudacxx/include/cuda/std/__cccl/builtin.h index fc4eab901a5..62ee10e17b2 100644 --- a/libcudacxx/include/cuda/std/__cccl/builtin.h +++ b/libcudacxx/include/cuda/std/__cccl/builtin.h @@ -233,11 +233,11 @@ # undef _CCCL_BUILTIN_MEMMOVE #endif // _CCCL_CUDA_COMPILER(NVCC) -#if _CCCL_CHECK_BUILTIN(__builtin_operator_new) && _CCCL_CHECK_BUILTIN(__builtin_operator_delete) \ +#if _CCCL_CHECK_BUILTIN(builtin_operator_new) && _CCCL_CHECK_BUILTIN(builtin_operator_delete) \ && _CCCL_CUDA_COMPILER(CLANG) # define _CCCL_BUILTIN_OPERATOR_DELETE(...) __builtin_operator_delete(__VA_ARGS__) # define _CCCL_BUILTIN_OPERATOR_NEW(...) __builtin_operator_new(__VA_ARGS__) -#endif // _CCCL_CHECK_BUILTIN(__builtin_operator_new) && _CCCL_CHECK_BUILTIN(__builtin_operator_delete) +#endif // _CCCL_CHECK_BUILTIN(builtin_operator_new) && _CCCL_CHECK_BUILTIN(builtin_operator_delete) #if _CCCL_CHECK_BUILTIN(builtin_prefetch) || _CCCL_COMPILER(GCC) # define _CCCL_BUILTIN_PREFETCH(...) NV_IF_TARGET(NV_IS_HOST, __builtin_prefetch(__VA_ARGS__);) diff --git a/libcudacxx/test/libcudacxx/cuda/numeric/overflow.arithmetic/sub_overflow.pass.cpp b/libcudacxx/test/libcudacxx/cuda/numeric/overflow.arithmetic/sub_overflow.pass.cpp index 4fbf0a2f068..465dfa52854 100644 --- a/libcudacxx/test/libcudacxx/cuda/numeric/overflow.arithmetic/sub_overflow.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/numeric/overflow.arithmetic/sub_overflow.pass.cpp @@ -49,16 +49,14 @@ __host__ __device__ constexpr void test_type() using cuda::std::is_signed_v; using cuda::std::is_unsigned_v; static_assert(is_same_v(Lhs{}, Rhs{})), cuda::overflow_result>); - using UResult = cuda::std::make_unsigned_t; - using URhs = cuda::std::make_unsigned_t; [[maybe_unused]] constexpr auto lhs_min = cuda::std::numeric_limits::min(); [[maybe_unused]] constexpr auto lhs_max = cuda::std::numeric_limits::max(); [[maybe_unused]] constexpr auto rhs_min = cuda::std::numeric_limits::min(); [[maybe_unused]] constexpr auto rhs_max = cuda::std::numeric_limits::max(); [[maybe_unused]] constexpr auto result_min = cuda::std::numeric_limits::min(); [[maybe_unused]] constexpr auto result_max = cuda::std::numeric_limits::max(); - [[maybe_unused]] constexpr auto neg_result_min = static_cast(cuda::neg(result_min)); - [[maybe_unused]] constexpr auto neg_rhs_min = static_cast(cuda::neg(rhs_min)); + [[maybe_unused]] constexpr auto neg_result_min = cuda::uabs(result_min); + [[maybe_unused]] constexpr auto neg_rhs_min = cuda::uabs(rhs_min); //-------------------------------------------------------------------------------------------------------------------- // trivial cases // 1. 0 - 0 -> should never overflow