Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 27 additions & 11 deletions libcudacxx/include/cuda/__numeric/add_overflow.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,15 @@

#include <cuda/std/__cccl/prologue.h>

#if _CCCL_CHECK_BUILTIN(__builtin_add_overflow) || _CCCL_COMPILER(GCC)
#if _CCCL_CHECK_BUILTIN(builtin_add_overflow) || _CCCL_COMPILER(GCC)
# define _CCCL_BUILTIN_ADD_OVERFLOW(...) __builtin_add_overflow(__VA_ARGS__)
#endif // _CCCL_CHECK_BUILTIN(__builtin_add_overflow)
#endif // _CCCL_CHECK_BUILTIN(builtin_add_overflow)
Comment on lines +47 to +49
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🙀


// nvc++ doesn't support 128-bit integers and crashes when certain type combinations are used (nvbug 5730860), so let's
// just disable the builtin for now.
#if _CCCL_COMPILER(NVHPC)
# undef _CCCL_BUILTIN_ADD_OVERFLOW
#endif // _CCCL_COMPILER(NVHPC)

_CCCL_BEGIN_NAMESPACE_CUDA

Expand Down Expand Up @@ -197,14 +203,24 @@ _CCCL_REQUIRES((::cuda::std::is_void_v<_Result> || ::cuda::std::__cccl_is_intege
[[nodiscard]]
_CCCL_API constexpr overflow_result<_ActualResult> add_overflow(const _Lhs __lhs, const _Rhs __rhs) noexcept
{
// (1) __builtin_add_overflow is not available in a constant expression with gcc + nvcc
// (2) __builtin_add_overflow generates suboptimal code with nvc++ and clang-cuda for device code
#if defined(_CCCL_BUILTIN_ADD_OVERFLOW) && _CCCL_HOST_COMPILATION() \
&& !(_CCCL_COMPILER(GCC) && _CCCL_CUDA_COMPILER(NVCC))
overflow_result<_ActualResult> __result;
__result.overflow = _CCCL_BUILTIN_ADD_OVERFLOW(__lhs, __rhs, &__result.value);
return __result;
#else
// We want to use __builtin_add_overflow only in host code. When compiling CUDA source file, we cannot use it in
// constant expressions, because it doesn't work before nvcc 13.1 and is buggy in 13.1. When compiling C++ source
// file, we can use it all the time.
#if defined(_CCCL_BUILTIN_ADD_OVERFLOW)
# if _CCCL_CUDA_COMPILATION()
_CCCL_IF_NOT_CONSTEVAL_DEFAULT
# endif // _CCCL_CUDA_COMPILATION()
{
NV_IF_TARGET(NV_IS_HOST, ({
overflow_result<_ActualResult> __result{};
__result.overflow = _CCCL_BUILTIN_ADD_OVERFLOW(__lhs, __rhs, &__result.value);
return __result;
}))
}
#endif // _CCCL_BUILTIN_ADD_OVERFLOW

// Host fallback + device implementation.
#if _CCCL_CUDA_COMPILATION() || !defined(_CCCL_BUILTIN_ADD_OVERFLOW)
using ::cuda::std::__make_nbit_int_t;
using ::cuda::std::__make_nbit_uint_t;
using ::cuda::std::__num_bits_v;
Expand Down Expand Up @@ -294,7 +310,7 @@ _CCCL_API constexpr overflow_result<_ActualResult> add_overflow(const _Lhs __lhs
}
return overflow_result<_ActualResult>{static_cast<_ActualResult>(__sum), false}; // because of opposite signs
}
#endif // defined(_CCCL_BUILTIN_ADD_OVERFLOW) && !_CCCL_CUDA_COMPILER(NVCC)
#endif // _CCCL_CUDA_COMPILATION() || !defined(_CCCL_BUILTIN_ADD_OVERFLOW)
}

//! @brief Adds two numbers \p __lhs and \p __rhs with overflow detection
Expand Down
38 changes: 27 additions & 11 deletions libcudacxx/include/cuda/__numeric/sub_overflow.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,15 @@

#include <cuda/std/__cccl/prologue.h>

#if _CCCL_CHECK_BUILTIN(__builtin_sub_overflow) || _CCCL_COMPILER(GCC)
#if _CCCL_CHECK_BUILTIN(builtin_sub_overflow) || _CCCL_COMPILER(GCC)
# define _CCCL_BUILTIN_SUB_OVERFLOW(...) __builtin_sub_overflow(__VA_ARGS__)
#endif // _CCCL_CHECK_BUILTIN(__builtin_sub_overflow)
#endif // _CCCL_CHECK_BUILTIN(builtin_sub_overflow)

// nvc++ doesn't support 128-bit integers and crashes when certain type combinations are used (nvbug 5730860), so let's
// just disable the builtin for now.
#if _CCCL_COMPILER(NVHPC)
# undef _CCCL_BUILTIN_SUB_OVERFLOW
#endif // _CCCL_COMPILER(NVHPC)

_CCCL_BEGIN_NAMESPACE_CUDA

Expand Down Expand Up @@ -213,14 +219,24 @@ _CCCL_REQUIRES((::cuda::std::is_void_v<_Result> || ::cuda::std::__cccl_is_intege
[[nodiscard]]
_CCCL_API constexpr overflow_result<_ActualResult> sub_overflow(const _Lhs __lhs, const _Rhs __rhs) noexcept
{
// (1) __builtin_sub_overflow is not available in a constant expression with gcc + nvcc
// (2) __builtin_sub_overflow generates suboptimal code with nvc++ and clang-cuda for device code
#if defined(_CCCL_BUILTIN_SUB_OVERFLOW) && _CCCL_HOST_COMPILATION() \
&& !(_CCCL_COMPILER(GCC) && _CCCL_CUDA_COMPILER(NVCC))
overflow_result<_ActualResult> __result;
__result.overflow = _CCCL_BUILTIN_SUB_OVERFLOW(__lhs, __rhs, &__result.value);
return __result;
#else
// We want to use __builtin_sub_overflow only in host code. When compiling CUDA source file, we cannot use it in
// constant expressions, because it doesn't work before nvcc 13.1 and is buggy in 13.1. When compiling C++ source
// file, we can use it all the time.
#if defined(_CCCL_BUILTIN_SUB_OVERFLOW)
# if _CCCL_CUDA_COMPILATION()
_CCCL_IF_NOT_CONSTEVAL_DEFAULT
# endif // _CCCL_CUDA_COMPILATION()
{
NV_IF_TARGET(NV_IS_HOST, ({
overflow_result<_ActualResult> __result{};
__result.overflow = _CCCL_BUILTIN_SUB_OVERFLOW(__lhs, __rhs, &__result.value);
return __result;
}))
}
#endif // _CCCL_BUILTIN_SUB_OVERFLOW

// Host fallback + device implementation.
#if _CCCL_CUDA_COMPILATION() || !defined(_CCCL_BUILTIN_SUB_OVERFLOW)
using ::cuda::std::common_type_t;
using ::cuda::std::is_signed_v;
using ::cuda::std::is_unsigned_v;
Expand Down Expand Up @@ -322,7 +338,7 @@ _CCCL_API constexpr overflow_result<_ActualResult> sub_overflow(const _Lhs __lhs
return overflow_result<_ActualResult>{__sub_ret, __is_overflow};
}
}
#endif // defined(_CCCL_BUILTIN_SUB_OVERFLOW) && !_CCCL_CUDA_COMPILER(NVCC)
#endif // _CCCL_CUDA_COMPILATION() || !defined(_CCCL_BUILTIN_SUB_OVERFLOW)
}

//! @brief Subtracts two numbers \p __lhs and \p __rhs with overflow detection
Expand Down
4 changes: 2 additions & 2 deletions libcudacxx/include/cuda/std/__cccl/builtin.h
Original file line number Diff line number Diff line change
Expand Up @@ -233,11 +233,11 @@
# undef _CCCL_BUILTIN_MEMMOVE
#endif // _CCCL_CUDA_COMPILER(NVCC)

#if _CCCL_CHECK_BUILTIN(__builtin_operator_new) && _CCCL_CHECK_BUILTIN(__builtin_operator_delete) \
#if _CCCL_CHECK_BUILTIN(builtin_operator_new) && _CCCL_CHECK_BUILTIN(builtin_operator_delete) \
&& _CCCL_CUDA_COMPILER(CLANG)
# define _CCCL_BUILTIN_OPERATOR_DELETE(...) __builtin_operator_delete(__VA_ARGS__)
# define _CCCL_BUILTIN_OPERATOR_NEW(...) __builtin_operator_new(__VA_ARGS__)
#endif // _CCCL_CHECK_BUILTIN(__builtin_operator_new) && _CCCL_CHECK_BUILTIN(__builtin_operator_delete)
#endif // _CCCL_CHECK_BUILTIN(builtin_operator_new) && _CCCL_CHECK_BUILTIN(builtin_operator_delete)

#if _CCCL_CHECK_BUILTIN(builtin_prefetch) || _CCCL_COMPILER(GCC)
# define _CCCL_BUILTIN_PREFETCH(...) NV_IF_TARGET(NV_IS_HOST, __builtin_prefetch(__VA_ARGS__);)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,16 +49,14 @@ __host__ __device__ constexpr void test_type()
using cuda::std::is_signed_v;
using cuda::std::is_unsigned_v;
static_assert(is_same_v<decltype(cuda::sub_overflow<Result>(Lhs{}, Rhs{})), cuda::overflow_result<Result>>);
using UResult = cuda::std::make_unsigned_t<Result>;
using URhs = cuda::std::make_unsigned_t<Rhs>;
[[maybe_unused]] constexpr auto lhs_min = cuda::std::numeric_limits<Lhs>::min();
[[maybe_unused]] constexpr auto lhs_max = cuda::std::numeric_limits<Lhs>::max();
[[maybe_unused]] constexpr auto rhs_min = cuda::std::numeric_limits<Rhs>::min();
[[maybe_unused]] constexpr auto rhs_max = cuda::std::numeric_limits<Rhs>::max();
[[maybe_unused]] constexpr auto result_min = cuda::std::numeric_limits<Result>::min();
[[maybe_unused]] constexpr auto result_max = cuda::std::numeric_limits<Result>::max();
[[maybe_unused]] constexpr auto neg_result_min = static_cast<UResult>(cuda::neg(result_min));
[[maybe_unused]] constexpr auto neg_rhs_min = static_cast<URhs>(cuda::neg(rhs_min));
[[maybe_unused]] constexpr auto neg_result_min = cuda::uabs(result_min);
[[maybe_unused]] constexpr auto neg_rhs_min = cuda::uabs(rhs_min);
//--------------------------------------------------------------------------------------------------------------------
// trivial cases
// 1. 0 - 0 -> should never overflow
Expand Down