1616
1717// ================================================================================
1818// this file has been auto-generated, do not modify its contents!
19- // date: 2025-08-21 10:13:04.148230
20- // git hash: 4d0d49cad7962d3f9ba4f2a0abfa2faea3ec7efa
19+ // date: 2025-09-02 18:31:16.281730
20+ // git hash: 023bc75e8ec67145cdcb447c5fd9aa7d7f180cc6
2121// ================================================================================
2222
2323#ifndef KERNEL_FLOAT_MACROS_H
5959 #define KERNEL_FLOAT_FP16_AVAILABLE (1 )
6060#endif // KERNEL_FLOAT_FP16_AVAILABLE
6161
62+ #ifndef KERNEL_FLOAT_FP16_OPS_AVAILABLE // may be pre-defined by the user; gates the native half_t operator specializations
63+ #define KERNEL_FLOAT_FP16_OPS_AVAILABLE ((KERNEL_FLOAT_IS_CUDA && __CUDA_ARCH__ >= 530 ) || KERNEL_FLOAT_IS_HIP)
64+ #endif // KERNEL_FLOAT_FP16_OPS_AVAILABLE
65+
6266#ifndef KERNEL_FLOAT_BF16_AVAILABLE
6367 #define KERNEL_FLOAT_BF16_AVAILABLE (1 )
6468#endif // KERNEL_FLOAT_BF16_AVAILABLE
6569
70+ #ifndef KERNEL_FLOAT_BF16_OPS_AVAILABLE // may be pre-defined by the user; gates the native bfloat16_t operator specializations
71+ #define KERNEL_FLOAT_BF16_OPS_AVAILABLE ((KERNEL_FLOAT_IS_CUDA && __CUDA_ARCH__ >= 800 ) || KERNEL_FLOAT_IS_HIP)
72+ #endif // KERNEL_FLOAT_BF16_OPS_AVAILABLE
73+
6674#ifndef KERNEL_FLOAT_FP8_AVAILABLE
6775 #ifdef __CUDACC_VER_MAJOR__
6876 #define KERNEL_FLOAT_FP8_AVAILABLE (__CUDACC_VER_MAJOR__ >= 12 )
@@ -4171,6 +4179,7 @@ struct allow_float_fallback<half_t> {
41714179#define KERNEL_FLOAT_FP16_UNARY_FUN (NAME, FUN1, FUN2 )
41724180#endif
41734181
4182+ #if KERNEL_FLOAT_FP16_OPS_AVAILABLE
41744183KERNEL_FLOAT_FP16_UNARY_FUN (sin, hsin, h2sin)
41754184KERNEL_FLOAT_FP16_UNARY_FUN(cos, hcos, h2cos)
41764185
@@ -4191,6 +4200,7 @@ KERNEL_FLOAT_FP16_UNARY_FUN(ceil, hceil, h2ceil)
41914200KERNEL_FLOAT_FP16_UNARY_FUN(rint, hrint, h2rint)
41924201KERNEL_FLOAT_FP16_UNARY_FUN(trunc, htrunc, h2trunc)
41934202KERNEL_FLOAT_FP16_UNARY_FUN(negate, __hneg, __hneg2)
4203+ #endif // KERNEL_FLOAT_FP16_OPS_AVAILABLE
41944204
41954205#if KERNEL_FLOAT_IS_DEVICE
41964206#define KERNEL_FLOAT_FP16_BINARY_FUN (NAME, FUN1, FUN2 ) \
@@ -4217,10 +4227,11 @@ KERNEL_FLOAT_FP16_UNARY_FUN(negate, __hneg, __hneg2)
42174227#endif
42184228
42194229// These are not available in HIP
4230+ #if KERNEL_FLOAT_FP16_OPS_AVAILABLE
42204231#if KERNEL_FLOAT_IS_CUDA
42214232KERNEL_FLOAT_FP16_BINARY_FUN (min, __hmin, __hmin2)
42224233KERNEL_FLOAT_FP16_BINARY_FUN(max, __hmax, __hmax2)
4223- #endif
4234+ #endif // KERNEL_FLOAT_IS_CUDA
42244235
42254236KERNEL_FLOAT_FP16_BINARY_FUN (add, __hadd, __hadd2)
42264237KERNEL_FLOAT_FP16_BINARY_FUN(subtract, __hsub, __hsub2)
@@ -4233,7 +4244,9 @@ KERNEL_FLOAT_FP16_BINARY_FUN(less, __hlt, __hlt2)
42334244KERNEL_FLOAT_FP16_BINARY_FUN(less_equal, __hle, __hle2)
42344245KERNEL_FLOAT_FP16_BINARY_FUN(greater, __hgt, __hgt2)
42354246KERNEL_FLOAT_FP16_BINARY_FUN(greater_equal, __hge, __hge2) // scalar __hge paired with its half2 counterpart __hge2 (>=, not >)
4247+ #endif // KERNEL_FLOAT_FP16_OPS_AVAILABLE
42364248
4249+ #if KERNEL_FLOAT_FP16_OPS_AVAILABLE
42374250#if KERNEL_FLOAT_IS_DEVICE
42384251namespace ops {
42394252template <>
@@ -4270,7 +4283,8 @@ struct apply_impl<accurate_policy, ops::fma<half_t>, 2, half_t, half_t, half_t,
42704283
42714284KERNEL_FLOAT_FAST_F32_MAP (KERNEL_FLOAT_FAST_FP16_DISPATCH)
42724285} // namespace detail
4273- #endif
4286+ #endif // KERNEL_FLOAT_IS_DEVICE
4287+ #endif // KERNEL_FLOAT_FP16_OPS_AVAILABLE
42744288
42754289#define KERNEL_FLOAT_FP16_CAST (T, TO_HALF, FROM_HALF ) \
42764290 namespace ops { \
@@ -4335,7 +4349,7 @@ KERNEL_FLOAT_VECTOR_ALIAS(half, half_t)
43354349
43364350} // namespace kernel_float
43374351
4338- #endif
4352+ #endif // KERNEL_FLOAT_FP16_AVAILABLE
43394353
43404354#endif // KERNEL_FLOAT_FP16_H
43414355#ifndef KERNEL_FLOAT_BF16_H
@@ -4369,10 +4383,6 @@ using bfloat16_t = __hip_bfloat16;
43694383using bfloat16x2_t = __hip_bfloat162;
43704384#endif
43714385
4372- #if KERNEL_FLOAT_IS_CUDA && __CUDA_ARCH__ >= 800
4373- #define KERNEL_FLOAT_BF16_OPS_SUPPORTED 1
4374- #endif
4375-
43764386template <>
43774387struct preferred_vector_size <bfloat16_t > {
43784388 static constexpr size_t value = 2 ;
@@ -4420,7 +4430,7 @@ struct allow_float_fallback<bfloat16_t> {
44204430 }; \
44214431 }
44224432
4423- #if KERNEL_FLOAT_BF16_OPS_SUPPORTED
4433+ #if KERNEL_FLOAT_BF16_OPS_AVAILABLE
44244434KERNEL_FLOAT_BF16_UNARY_FUN (sin, ::hsin, ::h2sin)
44254435KERNEL_FLOAT_BF16_UNARY_FUN(cos, ::hcos, ::h2cos)
44264436
@@ -4496,7 +4506,7 @@ KERNEL_FLOAT_BF16_UNARY_FUN(negate, hip_hneg, hip_hneg2)
44964506 }; \
44974507 }
44984508
4499- #if KERNEL_FLOAT_BF16_OPS_SUPPORTED
4509+ #if KERNEL_FLOAT_BF16_OPS_AVAILABLE
45004510KERNEL_FLOAT_BF16_BINARY_FUN (add, __hadd, __hadd2)
45014511KERNEL_FLOAT_BF16_BINARY_FUN(subtract, __hsub, __hsub2)
45024512KERNEL_FLOAT_BF16_BINARY_FUN(multiply, __hmul, __hmul2)
@@ -4512,7 +4522,7 @@ KERNEL_FLOAT_BF16_BINARY_FUN(greater, __hgt, __hgt2)
45124522KERNEL_FLOAT_BF16_BINARY_FUN(greater_equal, __hge, __hge2) // scalar __hge paired with its bfloat162 counterpart __hge2 (>=, not >)
45134523#endif
45144524
4515- #if KERNEL_FLOAT_BF16_OPS_SUPPORTED
4525+ #if KERNEL_FLOAT_BF16_OPS_AVAILABLE
45164526namespace ops {
45174527template <>
45184528struct fma <bfloat16_t > {
@@ -4583,7 +4593,7 @@ KERNEL_FLOAT_FAST_F32_MAP(KERNEL_FLOAT_FAST_BF16_DISPATCH)
45834593KERNEL_FLOAT_BF16_CAST (float , __float2bfloat16(input), __bfloat162float(input))
45844594KERNEL_FLOAT_BF16_CAST (double , __double2bfloat16(input), __bfloat162float(input))
45854595
4586- #if KERNEL_FLOAT_BF16_OPS_SUPPORTED
4596+ #if KERNEL_FLOAT_BF16_OPS_AVAILABLE
45874597// clang-format off
45884598// there are no official char casts. Instead, cast to int and then to char
45894599KERNEL_FLOAT_BF16_CAST (char , __int2bfloat16_rn(input), (char )__bfloat162int_rz(input));
@@ -4637,7 +4647,7 @@ struct promote_type<half_t, bfloat16_t> {
46374647} // namespace kernel_float
46384648
46394649#endif // KERNEL_FLOAT_FP16_AVAILABLE
4640- #endif
4650+ #endif // KERNEL_FLOAT_BF16_AVAILABLE
46414651
46424652#endif // KERNEL_FLOAT_BF16_H
46434653#pragma once
@@ -4728,7 +4738,7 @@ KERNEL_FLOAT_DEFINE_POLY(asin_poly, 3, 0.05167, -0.2057, 1.57)
47284738KERNEL_FLOAT_DEFINE_POLY (asin_poly, 4 , -0.02103 , 0.077 , -0.2129 , 1.57 )
47294739KERNEL_FLOAT_DEFINE_POLY (asin_poly, 5 , 0.009796 , -0.03772 , 0.0857 , -0.2142 , 1.57 )
47304740
4731- #if KERNEL_FLOAT_FP16_AVAILABLE
4741+ #if KERNEL_FLOAT_FP16_OPS_AVAILABLE
47324742KERNEL_FLOAT_DEVICE half2_t flipsign (half2_t input, half2_t sign) {
47334743 // Flip signbit of input when sign<0
47344744 uint32_t result;
@@ -4923,9 +4933,9 @@ KERNEL_FLOAT_DEVICE half2_t tanh(half2_t x) {
49234933 }
49244934}
49254935
4926- #endif // KERNEL_FLOAT_FP16_AVAILABLE
4936+ #endif // KERNEL_FLOAT_FP16_OPS_AVAILABLE
49274937
4928- #if KERNEL_FLOAT_BF16_OPS_SUPPORTED
4938+ #if KERNEL_FLOAT_BF16_OPS_AVAILABLE
49294939KERNEL_FLOAT_DEVICE bfloat16x2_t make_bfloat162 (bfloat16_t x) {
49304940 return {x, x};
49314941}
@@ -5005,7 +5015,7 @@ KERNEL_FLOAT_DEVICE bfloat16x2_t exp(bfloat16x2_t arg) {
50055015 transmute<bfloat16_t >(uint16_t (transmute<uint32_t >(a))),
50065016 transmute<bfloat16_t >(uint16_t (transmute<uint32_t >(b)))};
50075017}
5008- #endif
5018+ #endif // KERNEL_FLOAT_BF16_OPS_AVAILABLE
50095019} // namespace approx
50105020
50115021namespace detail {
@@ -5036,7 +5046,7 @@ struct apply_impl<approx_level_policy<Level>, F, 1, T, T> {
50365046 apply_impl<approx_level_policy<DEFAULT_LEVEL>, ops::FUN<T>, 2 , T, T> {}; \
50375047 }
50385048
5039- #if KERNEL_FLOAT_FP16_AVAILABLE
5049+ #if KERNEL_FLOAT_FP16_OPS_AVAILABLE
50405050KERNEL_FLOAT_DEFINE_APPROX_IMPL (half_t , sin, 4 )
50415051KERNEL_FLOAT_DEFINE_APPROX_IMPL(half_t , cos, 4 )
50425052KERNEL_FLOAT_DEFINE_APPROX_IMPL(half_t , rsqrt, 1 )
@@ -5048,7 +5058,7 @@ KERNEL_FLOAT_DEFINE_APPROX_IMPL(half_t, asin, 2)
50485058KERNEL_FLOAT_DEFINE_APPROX_IMPL(half_t , acos, 2 )
50495059#endif
50505060
5051- #if KERNEL_FLOAT_BF16_OPS_SUPPORTED
5061+ #if KERNEL_FLOAT_BF16_OPS_AVAILABLE
50525062KERNEL_FLOAT_DEFINE_APPROX_IMPL (bfloat16_t , cos, 4 )
50535063KERNEL_FLOAT_DEFINE_APPROX_IMPL(bfloat16_t , sin, 4 )
50545064KERNEL_FLOAT_DEFINE_APPROX_IMPL(bfloat16_t , rcp, 1 )
0 commit comments