Skip to content
Open
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
cd5065e
initial commit for launch loop optimization
artv3 Nov 25, 2025
484ff1a
add structs to store gpu thread/block info in launch ctx
artv3 Nov 25, 2025
18f332b
add cuda variant and add build guards for cpu
artv3 Dec 2, 2025
21f6184
Merge branch 'develop' into artv3/launch-loop-opt
artv3 Dec 2, 2025
73f224a
rework to support dim3 copy in ctx
artv3 Dec 11, 2025
8a02fee
Merge branch 'artv3/launch-loop-opt' of https://github.com/LLNL/RAJA …
artv3 Dec 11, 2025
1fbe50b
minor clean up pass
artv3 Dec 11, 2025
672889e
make format
artv3 Dec 11, 2025
5908a20
Update include/RAJA/pattern/launch/launch_core.hpp
artv3 Dec 11, 2025
316e019
Merge branch 'develop' into artv3/launch-loop-opt
rhornung67 Dec 15, 2025
4d9f800
clean up pass
artv3 Dec 18, 2025
d9ce271
update with develop and fix merge conflicts
artv3 Dec 18, 2025
85aef5a
fix build error
artv3 Dec 18, 2025
0469302
take develop submodule
artv3 Dec 18, 2025
4a695f2
cuda backend
artv3 Dec 18, 2025
f91a498
make style
artv3 Dec 18, 2025
d21c41f
omp backend
artv3 Dec 18, 2025
40a5c1b
seq backend + make style
artv3 Dec 18, 2025
e0f4825
clean up pass
artv3 Dec 18, 2025
96e99d5
Update include/RAJA/pattern/launch/launch_context_policy.hpp
artv3 Dec 18, 2025
a9f0cca
minor clean up
artv3 Dec 18, 2025
7d4595b
minor clean up
artv3 Dec 18, 2025
c23f76f
Merge branch 'artv3/launch-loop-opt' of github.com:LLNL/RAJA into art…
artv3 Dec 18, 2025
c990a4f
revert changes to example
artv3 Dec 18, 2025
f7939fd
remove specialization from launch policy
artv3 Dec 18, 2025
c24331c
make work for function pointers
artv3 Dec 18, 2025
0518138
store dim3 based on launch context type - hip
artv3 Dec 19, 2025
d5da29a
rework omp backend
artv3 Dec 19, 2025
af88dbb
update sequential backend
artv3 Dec 19, 2025
21ad0a8
get things building for cuda -- need a good clean up pass
artv3 Dec 19, 2025
646a95b
cuda clean up pass
artv3 Dec 19, 2025
597641b
clean up ordering in hip launch
artv3 Dec 19, 2025
5403737
clean up ordering
artv3 Dec 19, 2025
e41e970
make style
artv3 Dec 19, 2025
7c95430
use constexpt for getting dim values
artv3 Dec 19, 2025
bfe72de
merge develop, fix conflict
artv3 Jan 19, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions include/RAJA/pattern/launch/launch_context_policy.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
/*!
 ******************************************************************************
 *
 * \file
 *
 * \brief RAJA header file declaring the launch context policy tag types
 *        used to select the RAJA::launch context specialization
 *
 ******************************************************************************
 */

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
// Copyright (c) 2016-25, Lawrence Livermore National Security, LLC
// and RAJA project contributors. See the RAJA/LICENSE file for details.
//
// SPDX-License-Identifier: (BSD-3-Clause)
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//

// Guard name matches the file path: pattern/launch/launch_context_policy.hpp
#ifndef RAJA_pattern_launch_context_policy_HPP
#define RAJA_pattern_launch_context_policy_HPP

namespace RAJA
{

// Tag type selecting the default launch context (shared-memory
// bookkeeping only; no cached launch-dimension data).
class LaunchContextDefaultPolicy;

#if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE)
// Tag type selecting a launch context that additionally carries dim3
// copies of the launch dimensions for the CUDA/HIP backends.
class LaunchContextDim3Policy;
#endif

}  // namespace RAJA
#endif
57 changes: 40 additions & 17 deletions include/RAJA/pattern/launch/launch_core.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

#include "RAJA/config.hpp"
#include "RAJA/internal/get_platform.hpp"
#include "RAJA/pattern/launch/launch_context_policy.hpp"
#include "RAJA/util/StaticLayout.hpp"
#include "RAJA/util/macros.hpp"
#include "RAJA/util/plugins.hpp"
Expand Down Expand Up @@ -176,21 +177,20 @@ struct LaunchParams
Threads apply(Threads const& a) { return (threads = a); }
};

class LaunchContext
class LaunchContextBase
{
public:
// Bump style allocator used to
// get memory from the pool
size_t shared_mem_offset;

void* shared_mem_ptr;

#if defined(RAJA_SYCL_ACTIVE)
// SGS ODR issue
mutable ::sycl::nd_item<3>* itm;
#endif

RAJA_HOST_DEVICE LaunchContext()
RAJA_HOST_DEVICE LaunchContextBase()
: shared_mem_offset(0),
shared_mem_ptr(nullptr)
{}
Expand All @@ -209,20 +209,6 @@ class LaunchContext
return static_cast<T*>(mem_ptr);
}

/*
//Odd dependecy with atomics is breaking CI builds
template<typename T, size_t DIM, typename IDX_T=RAJA::Index_type, ptrdiff_t
z_stride=DIM-1, typename arg, typename... args> RAJA_HOST_DEVICE auto
getSharedMemoryView(size_t bytes, arg idx, args... idxs)
{
T * mem_ptr = &((T*) shared_mem_ptr)[shared_mem_offset];

shared_mem_offset += bytes*sizeof(T);
return RAJA::View<T, RAJA::Layout<DIM, IDX_T, z_stride>>(mem_ptr, idx,
idxs...);
}
*/

RAJA_HOST_DEVICE void releaseSharedMemory()
{
// On the cpu/gpu we want to restart the count
Expand All @@ -243,6 +229,43 @@ class LaunchContext
}
};

// Launch context parameterized on a policy tag (declared in
// launch_context_policy.hpp). The policy selects what per-launch data the
// context carries beyond the shared-memory bookkeeping in LaunchContextBase.
template<typename LaunchContextPolicy>
class LaunchContextT;

// Default context: no data beyond LaunchContextBase. Queried at compile
// time via hasDim3 (see if constexpr dispatch in the backend kernels).
template<>
class LaunchContextT<LaunchContextDefaultPolicy> : public LaunchContextBase
{
public:
  // Compile-time flag: this context does not store dim3 members.
  static constexpr bool hasDim3 = false;

  using LaunchContextBase::LaunchContextBase;
};

// Preserve backwards compatibility
using LaunchContext = LaunchContextT<LaunchContextDefaultPolicy>;

#if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE)
// CUDA/HIP context that additionally stores dim3 copies of the launch
// dimensions, seeded by the backend at kernel entry (see the backend
// launch kernels, which construct it from threadIdx/blockDim).
template<>
class LaunchContextT<LaunchContextDim3Policy> : public LaunchContextBase
{
public:
  // Compile-time flag: this context stores dim3 members.
  static constexpr bool hasDim3 = true;

  // Copies of the device launch dimensions; loop implementations read
  // these instead of re-reading the device built-ins.
  dim3 thread_id;
  dim3 block_dim;

  RAJA_HOST_DEVICE
  LaunchContextT() : LaunchContextBase(), thread_id(), block_dim() {}

  RAJA_HOST_DEVICE
  LaunchContextT(dim3 thread, dim3 block)
      : LaunchContextBase(),
        thread_id(thread),
        block_dim(block)
  {}
};
#endif

template<typename LAUNCH_POLICY>
struct LaunchExecute;

Expand Down
93 changes: 80 additions & 13 deletions include/RAJA/policy/cuda/launch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,33 +28,42 @@
namespace RAJA
{

/*!
 * CUDA global function used by RAJA::launch with reduction parameters.
 *
 * \tparam BODY                launch body (device-callable)
 * \tparam LaunchContextPolicy selects the launch context specialization;
 *                             a dim3-carrying context is seeded with
 *                             threadIdx/blockDim at kernel entry
 * \tparam ReduceParams        reduction parameter pack
 */
template<typename BODY, typename LaunchContextPolicy, typename ReduceParams>
__global__ void launch_new_reduce_global_fcn(const RAJA_CUDA_GRID_CONSTANT BODY
                                                 body_in,
                                             ReduceParams reduce_params)
{
  using RAJA::internal::thread_privatize;
  auto privatizer = thread_privatize(body_in);
  auto& body      = privatizer.get_priv();

  // Set pointer to shared memory
  extern __shared__ char raja_shmem_ptr[];

  // Construct the context once; only the dim3-carrying context needs its
  // thread/block members seeded. This avoids duplicating the shared-memory
  // setup and body invocation across constexpr branches.
  LaunchContextT<LaunchContextPolicy> ctx;
  if constexpr (LaunchContextT<LaunchContextPolicy>::hasDim3)
  {
    ctx.thread_id = threadIdx;
    ctx.block_dim = blockDim;
  }
  ctx.shared_mem_ptr = raja_shmem_ptr;
  RAJA::expt::invoke_body(reduce_params, body, ctx);

  // Using a flatten global policy as we may use all dimensions
  RAJA::expt::ParamMultiplexer::parampack_combine(
      RAJA::cuda_flatten_global_xyz_direct {}, reduce_params);
}

template<bool async>
template<bool async, typename LaunchContextPolicy>
struct LaunchExecute<
RAJA::policy::cuda::cuda_launch_explicit_t<async,
named_usage::unspecified,
named_usage::unspecified>>
named_usage::unspecified,
LaunchContextPolicy>>
{

template<typename BODY_IN, typename ReduceParams>
Expand All @@ -72,7 +81,8 @@ struct LaunchExecute<
EXEC_POL pol {};

auto func = reinterpret_cast<const void*>(
&launch_new_reduce_global_fcn<BODY, camp::decay<ReduceParams>>);
&launch_new_reduce_global_fcn<BODY, LaunchContextPolicy,
camp::decay<ReduceParams>>);

resources::Cuda cuda_res = res.get<RAJA::resources::Cuda>();

Expand Down Expand Up @@ -137,32 +147,48 @@ struct LaunchExecute<
/*!
 * CUDA global function used by RAJA::launch with reduction parameters and
 * compile-time launch bounds (fixed thread count / blocks per SM).
 *
 * \tparam BODY                launch body (device-callable)
 * \tparam num_threads         max threads per block for __launch_bounds__
 * \tparam BLOCKS_PER_SM       min resident blocks per SM for __launch_bounds__
 * \tparam LaunchContextPolicy selects the launch context specialization;
 *                             a dim3-carrying context is seeded with
 *                             threadIdx/blockDim at kernel entry
 * \tparam ReduceParams        reduction parameter pack
 */
template<typename BODY,
         int num_threads,
         size_t BLOCKS_PER_SM,
         typename LaunchContextPolicy,
         typename ReduceParams>
__launch_bounds__(num_threads, BLOCKS_PER_SM) __global__
    void launch_new_reduce_global_fcn_fixed(const RAJA_CUDA_GRID_CONSTANT BODY
                                                body_in,
                                            ReduceParams reduce_params)
{
  using RAJA::internal::thread_privatize;
  auto privatizer = thread_privatize(body_in);
  auto& body      = privatizer.get_priv();

  // Set pointer to shared memory
  extern __shared__ char raja_shmem_ptr[];

  // Construct the context once; only the dim3-carrying context needs its
  // thread/block members seeded. This avoids duplicating the shared-memory
  // setup and body invocation across constexpr branches.
  LaunchContextT<LaunchContextPolicy> ctx;
  if constexpr (LaunchContextT<LaunchContextPolicy>::hasDim3)
  {
    ctx.thread_id = threadIdx;
    ctx.block_dim = blockDim;
  }
  ctx.shared_mem_ptr = raja_shmem_ptr;
  RAJA::expt::invoke_body(reduce_params, body, ctx);

  // Using a flatten global policy as we may use all dimensions
  RAJA::expt::ParamMultiplexer::parampack_combine(
      RAJA::cuda_flatten_global_xyz_direct {}, reduce_params);
}

template<bool async, int nthreads, size_t BLOCKS_PER_SM>
template<bool async,
int nthreads,
size_t BLOCKS_PER_SM,
typename LaunchContextPolicy>
struct LaunchExecute<
RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads, BLOCKS_PER_SM>>
RAJA::policy::cuda::cuda_launch_explicit_t<async,
nthreads,
BLOCKS_PER_SM,
LaunchContextPolicy>>
{

template<typename BODY_IN, typename ReduceParams>
Expand All @@ -183,6 +209,7 @@ struct LaunchExecute<

auto func = reinterpret_cast<const void*>(
&launch_new_reduce_global_fcn_fixed<BODY, nthreads, BLOCKS_PER_SM,
LaunchContextPolicy,
camp::decay<ReduceParams>>);

resources::Cuda cuda_res = res.get<RAJA::resources::Cuda>();
Expand Down Expand Up @@ -245,6 +272,46 @@ struct LaunchExecute<
}
};

/*
  Loop methods which rely on a copy of threadIdx/blockDim
  for performance. In collaboration with AMD we have found this
  approach to be more performant.
*/

namespace expt
{

// Execution policy tag: thread-stride loop over the named dimension that
// reads its stride from the dim3 values stored in the launch context
// (LaunchContextDim3Policy) rather than from the device built-ins.
template<named_dim DIM>
struct cuda_ctx_thread_loop;

// Per-dimension convenience aliases.
using cuda_ctx_thread_loop_x = cuda_ctx_thread_loop<named_dim::x>;
using cuda_ctx_thread_loop_y = cuda_ctx_thread_loop<named_dim::y>;
using cuda_ctx_thread_loop_z = cuda_ctx_thread_loop<named_dim::z>;

}  // namespace expt

/*!
 * Loop implementation for cuda_ctx_thread_loop: strides over the segment
 * by the block dimension, with the thread index and block dimension taken
 * from the dim3 copies held in the launch context instead of the device
 * built-ins.
 */
template<typename SEGMENT, named_dim DIM>
struct LoopExecute<expt::cuda_ctx_thread_loop<DIM>, SEGMENT>
{

  template<typename BODY>
  static RAJA_INLINE RAJA_DEVICE void exec(
      LaunchContextT<LaunchContextDim3Policy> const& ctx,
      SEGMENT const& segment,
      BODY const& body)
  {

    const int len = segment.end() - segment.begin();

    // Thread-stride loop: start at this thread's index in DIM, step by
    // the block extent in DIM. (Removed an unused local that cast DIM to
    // int; CudaDimHelper<DIM> handles the dimension selection.)
    for (int i = ::RAJA::internal::CudaDimHelper<DIM>::get(ctx.thread_id);
         i < len; i += ::RAJA::internal::CudaDimHelper<DIM>::get(ctx.block_dim))
    {
      body(*(segment.begin() + i));
    }
  }
};

/*
CUDA generic loop implementations
*/
Expand Down
15 changes: 10 additions & 5 deletions include/RAJA/policy/cuda/policy.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include <utility>

#include "RAJA/pattern/reduce.hpp"
#include "RAJA/pattern/launch/launch_context_policy.hpp"

#include "RAJA/policy/PolicyBase.hpp"
#include "RAJA/policy/sequential/policy.hpp"
Expand Down Expand Up @@ -361,8 +362,9 @@ struct cuda_exec_explicit : public RAJA::make_policy_pattern_launch_platform_t<
};

template<bool Async,
int num_threads = named_usage::unspecified,
size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM>
int num_threads = named_usage::unspecified,
size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM,
typename LaunchContextPolicy = LaunchContextDefaultPolicy>
struct cuda_launch_explicit_t
: public RAJA::make_policy_pattern_launch_platform_t<
RAJA::Policy::cuda,
Expand Down Expand Up @@ -1725,13 +1727,16 @@ using policy::cuda::cuda_synchronize;

// policies usable with launch
template<bool Async,
int num_threads = named_usage::unspecified,
size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM>
int num_threads = named_usage::unspecified,
size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM,
typename LaunchContextPolicy = LaunchContextDefaultPolicy>
using cuda_launch_explicit_t =
policy::cuda::cuda_launch_explicit_t<Async, num_threads, BLOCKS_PER_SM>;

// CUDA will emit warnings if we specify BLOCKS_PER_SM but not num of threads
template<bool Async, int num_threads = named_usage::unspecified>
template<bool Async,
int num_threads = named_usage::unspecified,
typename LaunchContextPolicy = LaunchContextDefaultPolicy>
using cuda_launch_t =
policy::cuda::cuda_launch_explicit_t<Async,
num_threads,
Expand Down
Loading
Loading