Add invoke function

stijnh · stijnh · commit ff3f808cbae8 · 2025-08-12T13:56:24.000+02:00
diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
@@ -20,12 +20,12 @@ jobs:
     with:
       rocm-version: "6.3.0"
 
-  build-cuda-13-0:
+  build-cuda-12-7:
     if: github.ref_name == 'main'
     needs: build-cuda
     uses: ./.github/workflows/cmake-run-cuda.yml
     with:
-      cuda-version: "13.0.0"
+      cuda-version: "12.7.0"
 
   build-cuda-12-6:
     if: github.ref_name == 'main'
diff --git a/include/kernel_float/apply.h b/include/kernel_float/apply.h
@@ -116,6 +116,37 @@ broadcast_like(const V& input, const R& other) {
     return broadcast(input, vector_extent_type<R> {});
 }
 
+namespace detail {
+
+template<typename F, typename... Args>
+struct invoke_impl {
+    KERNEL_FLOAT_INLINE static decltype(auto) call(F fun, Args... args) {
+        return std::forward<F>(fun)(std::forward<Args>(args)...);
+    }
+};
+
+}  // namespace detail
+
+template<typename F, typename... Args>
+using result_t = decltype(detail::invoke_impl<decay_t<F>, decay_t<Args>...>::call(
+    detail::declval<F>(),
+    detail::declval<Args>()...));
+
+/**
+* Invoke the given function `fun` with the arguments `args...`.
+*
+* The main difference between directly calling `fun(args...)`, is that the behavior can be overridden by
+* specializing on `detail::invoke_impl<F, Args...>`.
+*
+* @return The result of `fun(args...)`.
+*/
+template<typename F, typename... Args>
+KERNEL_FLOAT_INLINE result_t<F, Args...> invoke(F fun, const Args&... args) {
+    return detail::invoke_impl<decay_t<F>, decay_t<Args>...>::call(
+        std::forward<F>(fun),
+        std::forward<Args>(args)...);
+}
+
 /**
  * The accurate_policy is designed for computations where maximum accuracy is essential. This policy ensures that all
  * operations are performed without any approximations or optimizations that could potentially alter the precise
@@ -168,13 +199,6 @@ using default_policy = accurate_policy;
 
 namespace detail {
 
-template<typename F, typename... Args>
-struct invoke_impl {
-    KERNEL_FLOAT_INLINE static auto call(F fun, Args... args) {
-        return fun(args...);
-    }
-};
-
 template<typename Policy, typename F, size_t N, typename Output, typename... Args>
 struct apply_impl;
 
@@ -190,7 +214,7 @@ struct apply_impl<accurate_policy, F, N, Output, Args...> {
     KERNEL_FLOAT_INLINE static void call(F fun, Output* output, const Args*... args) {
 #pragma unroll
         for (size_t i = 0; i < N; i++) {
-            output[i] = invoke_impl<F, Args...>::call(fun, args[i]...);
+            output[i] = detail::invoke_impl<F, Args...>::call(fun, args[i]...);
         }
     }
 };
@@ -225,10 +249,6 @@ using default_map_impl = map_impl<default_policy, F, N, Output, Args...>;
 
 }  // namespace detail
 
-template<typename F, typename... Args>
-using result_t = decltype(
-    detail::invoke_impl<F, Args...>::call(detail::declval<F>(), detail::declval<Args>()...));
-
 template<typename F, typename... Args>
 using map_type =
     vector<result_t<F, vector_value_type<Args>...>, broadcast_vector_extent_type<Args...>>;
diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h
@@ -16,8 +16,8 @@
 
 //================================================================================
 // this file has been auto-generated, do not modify its contents!
-// date: 2025-08-12 09:36:07.217735
-// git hash: 15a92ee9e96aef3147fdcfc3dcb3bd4ce501d063
+// date: 2025-08-12 13:55:51.042675
+// git hash: 714ca6b5fd63ef3497d80ef018cb9a9460c91391
 //================================================================================
 
 #ifndef KERNEL_FLOAT_MACROS_H
@@ -41,8 +41,8 @@
     #endif  // __CUDA_ARCH__
 #elif defined(__HIPCC__)
     #define KERNEL_FLOAT_IS_HIP (1)
-    #define KERNEL_FLOAT_DEVICE  __attribute__((always_inline)) __device__
-    #define KERNEL_FLOAT_INLINE  __attribute__((always_inline)) __host__ __device__
+    #define KERNEL_FLOAT_DEVICE  __attribute__((always_inline)) inline __device__
+    #define KERNEL_FLOAT_INLINE  __attribute__((always_inline)) inline __host__ __device__
 
     #ifdef __HIP_DEVICE_COMPILE__
         #define KERNEL_FLOAT_IS_DEVICE (1)
@@ -781,6 +781,37 @@ broadcast_like(const V& input, const R& other) {
     return broadcast(input, vector_extent_type<R> {});
 }
 
+namespace detail {
+
+template<typename F, typename... Args>
+struct invoke_impl {
+    KERNEL_FLOAT_INLINE static decltype(auto) call(F fun, Args... args) {
+        return std::forward<F>(fun)(std::forward<Args>(args)...);
+    }
+};
+
+}  // namespace detail
+
+template<typename F, typename... Args>
+using result_t = decltype(detail::invoke_impl<decay_t<F>, decay_t<Args>...>::call(
+    detail::declval<F>(),
+    detail::declval<Args>()...));
+
+/**
+* Invoke the given function `fun` with the arguments `args...`.
+*
+* The main difference between directly calling `fun(args...)`, is that the behavior can be overridden by
+* specializing on `detail::invoke_impl<F, Args...>`.
+*
+* @return The result of `fun(args...)`.
+*/
+template<typename F, typename... Args>
+KERNEL_FLOAT_INLINE result_t<F, Args...> invoke(F fun, const Args&... args) {
+    return detail::invoke_impl<decay_t<F>, decay_t<Args>...>::call(
+        std::forward<F>(fun),
+        std::forward<Args>(args)...);
+}
+
 /**
  * The accurate_policy is designed for computations where maximum accuracy is essential. This policy ensures that all
  * operations are performed without any approximations or optimizations that could potentially alter the precise
@@ -833,13 +864,6 @@ using default_policy = accurate_policy;
 
 namespace detail {
 
-template<typename F, typename... Args>
-struct invoke_impl {
-    KERNEL_FLOAT_INLINE static auto call(F fun, Args... args) {
-        return fun(args...);
-    }
-};
-
 template<typename Policy, typename F, size_t N, typename Output, typename... Args>
 struct apply_impl;
 
@@ -855,7 +879,7 @@ struct apply_impl<accurate_policy, F, N, Output, Args...> {
     KERNEL_FLOAT_INLINE static void call(F fun, Output* output, const Args*... args) {
 #pragma unroll
         for (size_t i = 0; i < N; i++) {
-            output[i] = invoke_impl<F, Args...>::call(fun, args[i]...);
+            output[i] = detail::invoke_impl<F, Args...>::call(fun, args[i]...);
         }
     }
 };
@@ -890,10 +914,6 @@ using default_map_impl = map_impl<default_policy, F, N, Output, Args...>;
 
 }  // namespace detail
 
-template<typename F, typename... Args>
-using result_t = decltype(
-    detail::invoke_impl<F, Args...>::call(detail::declval<F>(), detail::declval<Args>()...));
-
 template<typename F, typename... Args>
 using map_type =
     vector<result_t<F, vector_value_type<Args>...>, broadcast_vector_extent_type<Args...>>;
@@ -4407,14 +4427,12 @@ KERNEL_FLOAT_BF16_UNARY_FUN(negate, ::__hneg, ::__hneg2)
 #elif KERNEL_FLOAT_IS_HIP
 KERNEL_FLOAT_INLINE __hip_bfloat16 hip_habs(const __hip_bfloat16 a) {
     unsigned short int res = __bfloat16_as_ushort(a);
-    res &= 0x7FFF;
-    return __ushort_as_bfloat16();
+    return __ushort_as_bfloat16(res & 0x7FFF);
 }
 
 KERNEL_FLOAT_INLINE __hip_bfloat16 hip_hneg(const __hip_bfloat16 a) {
     unsigned short int res = __bfloat16_as_ushort(a);
-    res ^= 0x8000;
-    return __ushort_as_bfloat16(res);
+    return __ushort_as_bfloat16(res ^ 0x8000);
 }
 
 KERNEL_FLOAT_INLINE __hip_bfloat162 hip_habs2(const __hip_bfloat162 a) {
diff --git a/tests/common.h b/tests/common.h
@@ -1,7 +1,7 @@
 #pragma once
 
-#include <stdint.h>
 #include <math.h>
+#include <stdint.h>
 #include <tgmath.h>
 
 #include "catch2/catch_all.hpp"