diff --git a/libcudacxx/include/cuda/__numeric/add_overflow.h b/libcudacxx/include/cuda/__numeric/add_overflow.h
index da392c5cc4d..af5b46f3889 100644
--- a/libcudacxx/include/cuda/__numeric/add_overflow.h
+++ b/libcudacxx/include/cuda/__numeric/add_overflow.h
@@ -44,9 +44,15 @@
 
 #include <cuda/std/__cccl/prologue.h>
 
-#if _CCCL_CHECK_BUILTIN(__builtin_add_overflow) || _CCCL_COMPILER(GCC)
+#if _CCCL_CHECK_BUILTIN(builtin_add_overflow) || _CCCL_COMPILER(GCC)
 #  define _CCCL_BUILTIN_ADD_OVERFLOW(...) __builtin_add_overflow(__VA_ARGS__)
-#endif // _CCCL_CHECK_BUILTIN(__builtin_add_overflow)
+#endif // _CCCL_CHECK_BUILTIN(builtin_add_overflow)
+
+// nvc++ doesn't support 128-bit integers and crashes when certain type combinations are used (nvbug 5730860), so let's
+// just disable the builtin for now.
+#if _CCCL_COMPILER(NVHPC)
+#  undef _CCCL_BUILTIN_ADD_OVERFLOW
+#endif // _CCCL_COMPILER(NVHPC)
 
 _CCCL_BEGIN_NAMESPACE_CUDA
 
@@ -197,14 +203,24 @@ _CCCL_REQUIRES((::cuda::std::is_void_v<_Result> || ::cuda::std::__cccl_is_intege
 [[nodiscard]]
 _CCCL_API constexpr overflow_result<_ActualResult> add_overflow(const _Lhs __lhs, const _Rhs __rhs) noexcept
 {
-// (1) __builtin_add_overflow is not available in a constant expression with gcc + nvcc
-// (2) __builtin_add_overflow generates suboptimal code with nvc++ and clang-cuda for device code
-#if defined(_CCCL_BUILTIN_ADD_OVERFLOW) && _CCCL_HOST_COMPILATION() \
-  && !(_CCCL_COMPILER(GCC) && _CCCL_CUDA_COMPILER(NVCC))
-  overflow_result<_ActualResult> __result;
-  __result.overflow = _CCCL_BUILTIN_ADD_OVERFLOW(__lhs, __rhs, &__result.value);
-  return __result;
-#else
+  // We want to use __builtin_add_overflow only in host code. When compiling CUDA source file, we cannot use it in
+  // constant expressions, because it doesn't work before nvcc 13.1 and is buggy in 13.1. When compiling C++ source
+  // file, we can use it all the time.
+#if defined(_CCCL_BUILTIN_ADD_OVERFLOW)
+#  if _CCCL_CUDA_COMPILATION()
+  _CCCL_IF_NOT_CONSTEVAL_DEFAULT
+#  endif // _CCCL_CUDA_COMPILATION()
+  {
+    NV_IF_TARGET(NV_IS_HOST, ({
+                   overflow_result<_ActualResult> __result{};
+                   __result.overflow = _CCCL_BUILTIN_ADD_OVERFLOW(__lhs, __rhs, &__result.value);
+                   return __result;
+                 }))
+  }
+#endif // _CCCL_BUILTIN_ADD_OVERFLOW
+
+  // Host fallback + device implementation.
+#if _CCCL_CUDA_COMPILATION() || !defined(_CCCL_BUILTIN_ADD_OVERFLOW)
   using ::cuda::std::__make_nbit_int_t;
   using ::cuda::std::__make_nbit_uint_t;
   using ::cuda::std::__num_bits_v;
@@ -294,7 +310,7 @@ _CCCL_API constexpr overflow_result<_ActualResult> add_overflow(const _Lhs __lhs
     }
     return overflow_result<_ActualResult>{static_cast<_ActualResult>(__sum), false}; // because of opposite signs
   }
-#endif // defined(_CCCL_BUILTIN_ADD_OVERFLOW) && !_CCCL_CUDA_COMPILER(NVCC)
+#endif // _CCCL_CUDA_COMPILATION() || !defined(_CCCL_BUILTIN_ADD_OVERFLOW)
 }
 
 //! @brief Adds two numbers \p __lhs and \p __rhs with overflow detection
diff --git a/libcudacxx/include/cuda/__numeric/sub_overflow.h b/libcudacxx/include/cuda/__numeric/sub_overflow.h
index 1887cd71652..95fd158c302 100644
--- a/libcudacxx/include/cuda/__numeric/sub_overflow.h
+++ b/libcudacxx/include/cuda/__numeric/sub_overflow.h
@@ -42,9 +42,15 @@
 
 #include <cuda/std/__cccl/prologue.h>
 
-#if _CCCL_CHECK_BUILTIN(__builtin_sub_overflow) || _CCCL_COMPILER(GCC)
+#if _CCCL_CHECK_BUILTIN(builtin_sub_overflow) || _CCCL_COMPILER(GCC)
 #  define _CCCL_BUILTIN_SUB_OVERFLOW(...) __builtin_sub_overflow(__VA_ARGS__)
-#endif // _CCCL_CHECK_BUILTIN(__builtin_sub_overflow)
+#endif // _CCCL_CHECK_BUILTIN(builtin_sub_overflow)
+
+// nvc++ doesn't support 128-bit integers and crashes when certain type combinations are used (nvbug 5730860), so let's
+// just disable the builtin for now.
+#if _CCCL_COMPILER(NVHPC)
+#  undef _CCCL_BUILTIN_SUB_OVERFLOW
+#endif // _CCCL_COMPILER(NVHPC)
 
 _CCCL_BEGIN_NAMESPACE_CUDA
 
@@ -213,14 +219,24 @@ _CCCL_REQUIRES((::cuda::std::is_void_v<_Result> || ::cuda::std::__cccl_is_intege
 [[nodiscard]]
 _CCCL_API constexpr overflow_result<_ActualResult> sub_overflow(const _Lhs __lhs, const _Rhs __rhs) noexcept
 {
-// (1) __builtin_sub_overflow is not available in a constant expression with gcc + nvcc
-// (2) __builtin_sub_overflow generates suboptimal code with nvc++ and clang-cuda for device code
-#if defined(_CCCL_BUILTIN_SUB_OVERFLOW) && _CCCL_HOST_COMPILATION() \
-  && !(_CCCL_COMPILER(GCC) && _CCCL_CUDA_COMPILER(NVCC))
-  overflow_result<_ActualResult> __result;
-  __result.overflow = _CCCL_BUILTIN_SUB_OVERFLOW(__lhs, __rhs, &__result.value);
-  return __result;
-#else
+  // We want to use __builtin_sub_overflow only in host code. When compiling CUDA source file, we cannot use it in
+  // constant expressions, because it doesn't work before nvcc 13.1 and is buggy in 13.1. When compiling C++ source
+  // file, we can use it all the time.
+#if defined(_CCCL_BUILTIN_SUB_OVERFLOW)
+#  if _CCCL_CUDA_COMPILATION()
+  _CCCL_IF_NOT_CONSTEVAL_DEFAULT
+#  endif // _CCCL_CUDA_COMPILATION()
+  {
+    NV_IF_TARGET(NV_IS_HOST, ({
+                   overflow_result<_ActualResult> __result{};
+                   __result.overflow = _CCCL_BUILTIN_SUB_OVERFLOW(__lhs, __rhs, &__result.value);
+                   return __result;
+                 }))
+  }
+#endif // _CCCL_BUILTIN_SUB_OVERFLOW
+
+  // Host fallback + device implementation.
+#if _CCCL_CUDA_COMPILATION() || !defined(_CCCL_BUILTIN_SUB_OVERFLOW)
   using ::cuda::std::common_type_t;
   using ::cuda::std::is_signed_v;
   using ::cuda::std::is_unsigned_v;
@@ -322,7 +338,7 @@ _CCCL_API constexpr overflow_result<_ActualResult> sub_overflow(const _Lhs __lhs
       return overflow_result<_ActualResult>{__sub_ret, __is_overflow};
     }
   }
-#endif // defined(_CCCL_BUILTIN_SUB_OVERFLOW) && !_CCCL_CUDA_COMPILER(NVCC)
+#endif // _CCCL_CUDA_COMPILATION() || !defined(_CCCL_BUILTIN_SUB_OVERFLOW)
 }
 
 //! @brief Subtracts two numbers \p __lhs and \p __rhs with overflow detection
diff --git a/libcudacxx/include/cuda/std/__cccl/builtin.h b/libcudacxx/include/cuda/std/__cccl/builtin.h
index fc4eab901a5..62ee10e17b2 100644
--- a/libcudacxx/include/cuda/std/__cccl/builtin.h
+++ b/libcudacxx/include/cuda/std/__cccl/builtin.h
@@ -233,11 +233,11 @@
 #  undef _CCCL_BUILTIN_MEMMOVE
 #endif // _CCCL_CUDA_COMPILER(NVCC)
 
-#if _CCCL_CHECK_BUILTIN(__builtin_operator_new) && _CCCL_CHECK_BUILTIN(__builtin_operator_delete) \
+#if _CCCL_CHECK_BUILTIN(builtin_operator_new) && _CCCL_CHECK_BUILTIN(builtin_operator_delete) \
   && _CCCL_CUDA_COMPILER(CLANG)
 #  define _CCCL_BUILTIN_OPERATOR_DELETE(...) __builtin_operator_delete(__VA_ARGS__)
 #  define _CCCL_BUILTIN_OPERATOR_NEW(...)    __builtin_operator_new(__VA_ARGS__)
-#endif // _CCCL_CHECK_BUILTIN(__builtin_operator_new) && _CCCL_CHECK_BUILTIN(__builtin_operator_delete)
+#endif // _CCCL_CHECK_BUILTIN(builtin_operator_new) && _CCCL_CHECK_BUILTIN(builtin_operator_delete)
 
 #if _CCCL_CHECK_BUILTIN(builtin_prefetch) || _CCCL_COMPILER(GCC)
 #  define _CCCL_BUILTIN_PREFETCH(...) NV_IF_TARGET(NV_IS_HOST, __builtin_prefetch(__VA_ARGS__);)
diff --git a/libcudacxx/test/libcudacxx/cuda/numeric/overflow.arithmetic/sub_overflow.pass.cpp b/libcudacxx/test/libcudacxx/cuda/numeric/overflow.arithmetic/sub_overflow.pass.cpp
index 4fbf0a2f068..465dfa52854 100644
--- a/libcudacxx/test/libcudacxx/cuda/numeric/overflow.arithmetic/sub_overflow.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/numeric/overflow.arithmetic/sub_overflow.pass.cpp
@@ -49,16 +49,14 @@ __host__ __device__ constexpr void test_type()
   using cuda::std::is_signed_v;
   using cuda::std::is_unsigned_v;
   static_assert(is_same_v<decltype(cuda::sub_overflow<Result>(Lhs{}, Rhs{})), cuda::overflow_result<Result>>);
-  using UResult                                  = cuda::std::make_unsigned_t<Result>;
-  using URhs                                     = cuda::std::make_unsigned_t<Rhs>;
   [[maybe_unused]] constexpr auto lhs_min        = cuda::std::numeric_limits<Lhs>::min();
   [[maybe_unused]] constexpr auto lhs_max        = cuda::std::numeric_limits<Lhs>::max();
   [[maybe_unused]] constexpr auto rhs_min        = cuda::std::numeric_limits<Rhs>::min();
   [[maybe_unused]] constexpr auto rhs_max        = cuda::std::numeric_limits<Rhs>::max();
   [[maybe_unused]] constexpr auto result_min     = cuda::std::numeric_limits<Result>::min();
   [[maybe_unused]] constexpr auto result_max     = cuda::std::numeric_limits<Result>::max();
-  [[maybe_unused]] constexpr auto neg_result_min = static_cast<UResult>(cuda::neg(result_min));
-  [[maybe_unused]] constexpr auto neg_rhs_min    = static_cast<URhs>(cuda::neg(rhs_min));
+  [[maybe_unused]] constexpr auto neg_result_min = cuda::uabs(result_min);
+  [[maybe_unused]] constexpr auto neg_rhs_min    = cuda::uabs(rhs_min);
   //--------------------------------------------------------------------------------------------------------------------
   // trivial cases
   // 1. 0 - 0 -> should never overflow