Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,13 @@ install(FILES
# Setup internal RAJA configuration options
include(cmake/SetupRajaConfig.cmake)

if (RAJA_ENABLE_JIT)
  # JIT support pulls in Proteus headers; fail early with a clear message
  # instead of silently adding "/include" when the variable is unset.
  if (NOT DEFINED PROTEUS_INSTALL_DIR)
    message(FATAL_ERROR
      "RAJA_ENABLE_JIT is ON but PROTEUS_INSTALL_DIR is not set. "
      "Set -DPROTEUS_INSTALL_DIR=<path-to-proteus-install>.")
  endif()
  target_include_directories(RAJA
    PUBLIC
    "${PROTEUS_INSTALL_DIR}/include"
  )
endif()

if(RAJA_ENABLE_TESTS)
add_subdirectory(test)
endif()
Expand Down
1 change: 1 addition & 0 deletions cmake/SetupRajaOptions.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ option(RAJA_ENABLE_RUNTIME_PLUGINS "Enable support for loading plugins at runtim
option(RAJA_ALLOW_INCONSISTENT_OPTIONS "Enable inconsistent values for ENABLE_X and RAJA_ENABLE_X options" Off)

option(RAJA_ENABLE_DESUL_ATOMICS "Enable support of desul atomics" Off)
option(RAJA_ENABLE_JIT "Enable JIT compilation for RAJA kernels" Off)
set(DESUL_ENABLE_TESTS Off CACHE BOOL "")

set(TEST_DRIVER "" CACHE STRING "driver used to wrap test commands")
4 changes: 4 additions & 0 deletions include/RAJA/RAJA.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,10 @@
#include "RAJA/policy/hip.hpp"
#endif

#if defined(RAJA_ENABLE_JIT)
#include "proteus/JitInterface.hpp"
#endif

#if defined(RAJA_ENABLE_SYCL)
#include "RAJA/policy/sycl.hpp"
#endif
Expand Down
1 change: 1 addition & 0 deletions include/RAJA/config.hpp.in
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@ static_assert(RAJA_HAS_SOME_CXX14,
#cmakedefine RAJA_ENABLE_CLANG_CUDA
#cmakedefine RAJA_ENABLE_HIP
#cmakedefine RAJA_ENABLE_SYCL
#cmakedefine RAJA_ENABLE_JIT

#cmakedefine RAJA_ENABLE_OMP_TASK
#cmakedefine RAJA_ALLOW_OPENMP_5_1_ATOMICS
Expand Down
26 changes: 15 additions & 11 deletions include/RAJA/policy/cuda/forall.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@

#include "RAJA/util/macros.hpp"
#include "RAJA/util/types.hpp"
#include "RAJA/util/Jit.hpp"

#include "RAJA/internal/fault_tolerance.hpp"

Expand Down Expand Up @@ -381,10 +382,11 @@ template<typename EXEC_POL,
(IterationGetter::block_size > 0),
size_t> BlockSize = IterationGetter::block_size>
__launch_bounds__(BlockSize, BlocksPerSM) __global__
void forallp_cuda_kernel(const RAJA_CUDA_GRID_CONSTANT LOOP_BODY loop_body,
const RAJA_CUDA_GRID_CONSTANT Iterator idx,
const RAJA_CUDA_GRID_CONSTANT IndexType length,
ForallParam f_params)
RAJA_JIT_COMPILE_ARGS(3) void forallp_cuda_kernel(
const RAJA_CUDA_GRID_CONSTANT LOOP_BODY loop_body,
const RAJA_CUDA_GRID_CONSTANT Iterator idx,
const RAJA_CUDA_GRID_CONSTANT IndexType length,
ForallParam f_params)
{
using RAJA::internal::thread_privatize;
auto privatizer = thread_privatize(loop_body);
Expand Down Expand Up @@ -412,7 +414,7 @@ template<typename EXEC_POL,
IterationMapping>::value &&
(IterationGetter::block_size <= 0),
size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
__global__ void forallp_cuda_kernel(
__global__ RAJA_JIT_COMPILE_ARGS(3) void forallp_cuda_kernel(
const RAJA_CUDA_GRID_CONSTANT LOOP_BODY loop_body,
const RAJA_CUDA_GRID_CONSTANT Iterator idx,
const RAJA_CUDA_GRID_CONSTANT IndexType length,
Expand Down Expand Up @@ -448,10 +450,11 @@ template<
(IterationGetter::block_size > 0),
size_t> BlockSize = IterationGetter::block_size>
__launch_bounds__(BlockSize, BlocksPerSM) __global__
void forallp_cuda_kernel(const RAJA_CUDA_GRID_CONSTANT LOOP_BODY loop_body,
const RAJA_CUDA_GRID_CONSTANT Iterator idx,
const RAJA_CUDA_GRID_CONSTANT IndexType length,
ForallParam f_params)
RAJA_JIT_COMPILE_ARGS(3) void forallp_cuda_kernel(
const RAJA_CUDA_GRID_CONSTANT LOOP_BODY loop_body,
const RAJA_CUDA_GRID_CONSTANT Iterator idx,
const RAJA_CUDA_GRID_CONSTANT IndexType length,
ForallParam f_params)
{
using RAJA::internal::thread_privatize;
auto privatizer = thread_privatize(loop_body);
Expand Down Expand Up @@ -482,7 +485,7 @@ template<
IterationMapping>::value &&
(IterationGetter::block_size <= 0),
size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
__global__ void forallp_cuda_kernel(
__global__ RAJA_JIT_COMPILE_ARGS(3) void forallp_cuda_kernel(
const RAJA_CUDA_GRID_CONSTANT LOOP_BODY loop_body,
const RAJA_CUDA_GRID_CONSTANT Iterator idx,
const RAJA_CUDA_GRID_CONSTANT IndexType length,
Expand Down Expand Up @@ -554,7 +557,7 @@ forall_impl(resources::Cuda cuda_res,
// Only launch kernel if we have something to iterate over
if (len > 0)
{

RAJA::register_lambda(loop_body);
auto func = reinterpret_cast<const void*>(
&impl::forallp_cuda_kernel<EXEC_POL, BlocksPerSM, Iterator, LOOP_BODY,
IndexType, camp::decay<ForallParam>>);
Expand Down Expand Up @@ -642,6 +645,7 @@ RAJA_INLINE resources::EventProxy<resources::Cuda> forall_impl(
LoopBody&& loop_body)
{
int num_seg = iset.getNumSegments();
RAJA::register_lambda(loop_body);
for (int isi = 0; isi < num_seg; ++isi)
{
iset.segmentCall(
Expand Down
8 changes: 5 additions & 3 deletions include/RAJA/policy/cuda/kernel/CudaKernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@

#include "RAJA/util/macros.hpp"
#include "RAJA/util/types.hpp"
#include "RAJA/util/Jit.hpp"

#include "RAJA/pattern/kernel.hpp"
#include "RAJA/pattern/kernel/For.hpp"
Expand Down Expand Up @@ -211,7 +212,8 @@ namespace internal
* CUDA global function for launching CudaKernel policies
*/
template<typename Data, typename Exec>
__global__ void CudaKernelLauncher(const RAJA_CUDA_GRID_CONSTANT Data data)
__global__ RAJA_JIT_COMPILE void CudaKernelLauncher(
const RAJA_CUDA_GRID_CONSTANT Data data)
{

using data_t = camp::decay<Data>;
Expand All @@ -231,7 +233,7 @@ __global__ void CudaKernelLauncher(const RAJA_CUDA_GRID_CONSTANT Data data)
* This launcher is used by the CudaKernelFixed policies.
*/
template<int BlockSize, int BlocksPerSM, typename Data, typename Exec>
__launch_bounds__(BlockSize, BlocksPerSM) __global__
__launch_bounds__(BlockSize, BlocksPerSM) __global__ RAJA_JIT_COMPILE
void CudaKernelLauncherFixed(const RAJA_CUDA_GRID_CONSTANT Data data)
{

Expand Down Expand Up @@ -681,7 +683,7 @@ struct StatementExecutor<
auto cuda_data = RAJA::cuda::make_launch_body(
func, launch_dims.dims.blocks, launch_dims.dims.threads, shmem, res,
data);

RAJA::register_lambda(func);
//
// Launch the kernel
//
Expand Down
13 changes: 7 additions & 6 deletions include/RAJA/policy/cuda/launch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,15 @@
#include "RAJA/policy/cuda/MemUtils_CUDA.hpp"
#include "RAJA/policy/cuda/raja_cudaerrchk.hpp"
#include "RAJA/util/resource.hpp"
#include "RAJA/util/Jit.hpp"

namespace RAJA
{

template<typename BODY, typename ReduceParams>
__global__ void launch_new_reduce_global_fcn(const RAJA_CUDA_GRID_CONSTANT BODY
body_in,
ReduceParams reduce_params)
__global__ RAJA_JIT_COMPILE void launch_new_reduce_global_fcn(
const RAJA_CUDA_GRID_CONSTANT BODY body_in,
ReduceParams reduce_params)
{
LaunchContext ctx;

Expand Down Expand Up @@ -95,7 +96,7 @@ struct LaunchExecute<
if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero &&
blockSize.x > zero && blockSize.y > zero && blockSize.z > zero)
{

RAJA::register_lambda(body_in);
RAJA_FT_BEGIN;

size_t shared_mem_size = launch_params.shared_mem_size;
Expand Down Expand Up @@ -138,7 +139,7 @@ template<typename BODY,
int num_threads,
size_t BLOCKS_PER_SM,
typename ReduceParams>
__launch_bounds__(num_threads, BLOCKS_PER_SM) __global__
__launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ RAJA_JIT_COMPILE
void launch_new_reduce_global_fcn_fixed(const RAJA_CUDA_GRID_CONSTANT BODY
body_in,
ReduceParams reduce_params)
Expand Down Expand Up @@ -206,7 +207,7 @@ struct LaunchExecute<
if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero &&
blockSize.x > zero && blockSize.y > zero && blockSize.z > zero)
{

RAJA::register_lambda(body_in);
RAJA_FT_BEGIN;

size_t shared_mem_size = launch_params.shared_mem_size;
Expand Down
38 changes: 21 additions & 17 deletions include/RAJA/policy/hip/forall.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@

#include "RAJA/util/macros.hpp"
#include "RAJA/util/types.hpp"
#include "RAJA/util/Jit.hpp"

#include "RAJA/internal/fault_tolerance.hpp"

Expand Down Expand Up @@ -374,10 +375,10 @@ template<typename EXEC_POL,
(IterationGetter::block_size > 0),
size_t> BlockSize = IterationGetter::block_size>
__launch_bounds__(BlockSize, 1) __global__
void forallp_hip_kernel(const LOOP_BODY loop_body,
const Iterator idx,
const IndexType length,
ForallParam f_params)
RAJA_JIT_COMPILE_ARGS(3) void forallp_hip_kernel(const LOOP_BODY loop_body,
const Iterator idx,
const IndexType length,
ForallParam f_params)
{
using RAJA::internal::thread_privatize;
auto privatizer = thread_privatize(loop_body);
Expand All @@ -403,10 +404,11 @@ template<typename EXEC_POL,
IterationMapping>::value &&
(IterationGetter::block_size <= 0),
size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
__global__ void forallp_hip_kernel(const LOOP_BODY loop_body,
const Iterator idx,
const IndexType length,
ForallParam f_params)
__global__ RAJA_JIT_COMPILE_ARGS(3) void forallp_hip_kernel(
const LOOP_BODY loop_body,
const Iterator idx,
const IndexType length,
ForallParam f_params)
{
using RAJA::internal::thread_privatize;
auto privatizer = thread_privatize(loop_body);
Expand Down Expand Up @@ -435,10 +437,10 @@ template<
(IterationGetter::block_size > 0),
size_t> BlockSize = IterationGetter::block_size>
__launch_bounds__(BlockSize, 1) __global__
void forallp_hip_kernel(const LOOP_BODY loop_body,
const Iterator idx,
const IndexType length,
ForallParam f_params)
RAJA_JIT_COMPILE_ARGS(3) void forallp_hip_kernel(const LOOP_BODY loop_body,
const Iterator idx,
const IndexType length,
ForallParam f_params)
{
using RAJA::internal::thread_privatize;
auto privatizer = thread_privatize(loop_body);
Expand Down Expand Up @@ -466,10 +468,11 @@ template<
IterationMapping>::value &&
(IterationGetter::block_size <= 0),
size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
__global__ void forallp_hip_kernel(const LOOP_BODY loop_body,
const Iterator idx,
const IndexType length,
ForallParam f_params)
__global__ RAJA_JIT_COMPILE_ARGS(3) void forallp_hip_kernel(
const LOOP_BODY loop_body,
const Iterator idx,
const IndexType length,
ForallParam f_params)
{
using RAJA::internal::thread_privatize;
auto privatizer = thread_privatize(loop_body);
Expand Down Expand Up @@ -536,7 +539,7 @@ forall_impl(resources::Hip hip_res,
// Only launch kernel if we have something to iterate over
if (len > 0)
{

RAJA::register_lambda(loop_body);
auto func = reinterpret_cast<const void*>(
&impl::forallp_hip_kernel<EXEC_POL, Iterator, LOOP_BODY, IndexType,
camp::decay<ForallParam>>);
Expand Down Expand Up @@ -620,6 +623,7 @@ RAJA_INLINE resources::EventProxy<resources::Hip> forall_impl(
const TypedIndexSet<SegmentTypes...>& iset,
LoopBody&& loop_body)
{
RAJA::register_lambda(loop_body);
int num_seg = iset.getNumSegments();
for (int isi = 0; isi < num_seg; ++isi)
{
Expand Down
7 changes: 4 additions & 3 deletions include/RAJA/policy/hip/kernel/HipKernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@

#include "RAJA/util/macros.hpp"
#include "RAJA/util/types.hpp"
#include "RAJA/util/Jit.hpp"

#include "RAJA/pattern/kernel.hpp"
#include "RAJA/pattern/kernel/For.hpp"
Expand Down Expand Up @@ -179,7 +180,7 @@ namespace internal
* HIP global function for launching HipKernel policies
*/
template<typename Data, typename Exec>
__global__ void HipKernelLauncher(const Data data)
__global__ RAJA_JIT_COMPILE void HipKernelLauncher(const Data data)
{

using data_t = camp::decay<Data>;
Expand All @@ -198,7 +199,7 @@ __global__ void HipKernelLauncher(const Data data)
* This launcher is used by the HipKernelFixed policies.
*/
template<int BlockSize, typename Data, typename Exec>
__launch_bounds__(BlockSize, 1) __global__
__launch_bounds__(BlockSize, 1) __global__ RAJA_JIT_COMPILE
void HipKernelLauncherFixed(const Data data)
{

Expand Down Expand Up @@ -639,7 +640,7 @@ struct StatementExecutor<
auto hip_data = RAJA::hip::make_launch_body(
func, launch_dims.dims.blocks, launch_dims.dims.threads, shmem, res,
data);

RAJA::register_lambda(func);
//
// Launch the kernel
//
Expand Down
12 changes: 7 additions & 5 deletions include/RAJA/policy/hip/launch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,15 @@
#include "RAJA/policy/hip/MemUtils_HIP.hpp"
#include "RAJA/policy/hip/raja_hiperrchk.hpp"
#include "RAJA/util/resource.hpp"
#include "RAJA/util/Jit.hpp"

namespace RAJA
{

template<typename BODY, typename ReduceParams>
__global__ void launch_new_reduce_global_fcn(const BODY body_in,
ReduceParams reduce_params)
__global__ RAJA_JIT_COMPILE void launch_new_reduce_global_fcn(
const BODY body_in,
ReduceParams reduce_params)
{
LaunchContext ctx;

Expand Down Expand Up @@ -92,7 +94,7 @@ struct LaunchExecute<
if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero &&
blockSize.x > zero && blockSize.y > zero && blockSize.z > zero)
{

RAJA::register_lambda(body_in);
RAJA_FT_BEGIN;

size_t shared_mem_size = launch_params.shared_mem_size;
Expand Down Expand Up @@ -133,7 +135,7 @@ struct LaunchExecute<
};

template<typename BODY, int num_threads, typename ReduceParams>
__launch_bounds__(num_threads, 1) __global__
__launch_bounds__(num_threads, 1) __global__ RAJA_JIT_COMPILE
void launch_new_reduce_global_fcn_fixed(const BODY body_in,
ReduceParams reduce_params)
{
Expand Down Expand Up @@ -210,7 +212,7 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, nthreads>>
launch_info.res = hip_res;

{

RAJA::register_lambda(body_in);
RAJA::expt::ParamMultiplexer::parampack_init(pol, launch_reducers,
launch_info);

Expand Down
34 changes: 34 additions & 0 deletions include/RAJA/util/Jit.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#ifndef RAJA_jit_HPP
#define RAJA_jit_HPP

#include "RAJA/config.hpp"

#include <utility>

#if defined(RAJA_ENABLE_JIT)
#include "proteus/JitInterface.hpp"
#endif

namespace RAJA
{

/*!
 * \brief Register a lambda with the Proteus JIT runtime.
 *
 * When RAJA_ENABLE_JIT is defined, the lambda is handed to
 * proteus::register_lambda so the JIT can specialize generated kernels
 * on it; otherwise this is a pass-through that returns the lambda
 * unchanged, so call sites compile identically in non-JIT builds.
 *
 * \param lambda callable to register (forwarding reference)
 * \return result of proteus::register_lambda, or the lambda itself
 */
template<typename Lambda>
inline auto register_lambda(Lambda&& lambda)
{
#if defined(RAJA_ENABLE_JIT)
  return proteus::register_lambda(std::forward<Lambda>(lambda));
#else
  return std::forward<Lambda>(lambda);
#endif
}

/*!
 * \brief Mark a value as a JIT specialization variable.
 *
 * When RAJA_ENABLE_JIT is defined, the value is handed to
 * proteus::jit_variable so the JIT can bake it into generated code;
 * otherwise the value is returned unchanged.
 *
 * \param arg value taken by copy; moved into Proteus when JIT is enabled
 * \return the Proteus-wrapped variable, or the value itself
 */
template<typename T>
inline auto jit_variable(T arg)
{
#if defined(RAJA_ENABLE_JIT)
  // arg is a by-value parameter, so std::move (not std::forward) is the
  // correct way to hand it off; T is not a forwarding reference here.
  return proteus::jit_variable(std::move(arg));
#else
  return arg;  // function parameters are implicitly moved on return
#endif
}

}  // namespace RAJA

#endif  // RAJA_jit_HPP
Loading