Changes from 8 commits (36 commits in the pull request)
cd5065e
initial commit for launch loop optimization
artv3 Nov 25, 2025
484ff1a
add structs to store gpu thread/block info in launch ctx
artv3 Nov 25, 2025
18f332b
add cuda variant and add build guards for cpu
artv3 Dec 2, 2025
21f6184
Merge branch 'develop' into artv3/launch-loop-opt
artv3 Dec 2, 2025
73f224a
rework to support dim3 copy in ctx
artv3 Dec 11, 2025
8a02fee
Merge branch 'artv3/launch-loop-opt' of https://github.com/LLNL/RAJA …
artv3 Dec 11, 2025
1fbe50b
minor clean up pass
artv3 Dec 11, 2025
672889e
make format
artv3 Dec 11, 2025
5908a20
Update include/RAJA/pattern/launch/launch_core.hpp
artv3 Dec 11, 2025
316e019
Merge branch 'develop' into artv3/launch-loop-opt
rhornung67 Dec 15, 2025
4d9f800
clean up pass
artv3 Dec 18, 2025
d9ce271
update with develop and fix merge conflicts
artv3 Dec 18, 2025
85aef5a
fix build error
artv3 Dec 18, 2025
0469302
take develop submodule
artv3 Dec 18, 2025
4a695f2
cuda backend
artv3 Dec 18, 2025
f91a498
make style
artv3 Dec 18, 2025
d21c41f
omp backend
artv3 Dec 18, 2025
40a5c1b
seq backend + make style
artv3 Dec 18, 2025
e0f4825
clean up pass
artv3 Dec 18, 2025
96e99d5
Update include/RAJA/pattern/launch/launch_context_policy.hpp
artv3 Dec 18, 2025
a9f0cca
minor clean up
artv3 Dec 18, 2025
7d4595b
minor clean up
artv3 Dec 18, 2025
c23f76f
Merge branch 'artv3/launch-loop-opt' of github.com:LLNL/RAJA into art…
artv3 Dec 18, 2025
c990a4f
revert changes to example
artv3 Dec 18, 2025
f7939fd
remove specialization from launch policy
artv3 Dec 18, 2025
c24331c
make work for function pointers
artv3 Dec 18, 2025
0518138
store dim3 based on launch context type - hip
artv3 Dec 19, 2025
d5da29a
rework omp backend
artv3 Dec 19, 2025
af88dbb
update sequential backend
artv3 Dec 19, 2025
21ad0a8
get things building for cuda -- need a good clean up pass
artv3 Dec 19, 2025
646a95b
cuda clean up pass
artv3 Dec 19, 2025
597641b
clean up ordering in hip launch
artv3 Dec 19, 2025
5403737
clean up ordering
artv3 Dec 19, 2025
e41e970
make style
artv3 Dec 19, 2025
7c95430
use constexpt for getting dim values
artv3 Dec 19, 2025
bfe72de
merge develop, fix conflict
artv3 Jan 19, 2026
65 changes: 62 additions & 3 deletions include/RAJA/pattern/launch/launch_core.hpp
@@ -176,25 +176,49 @@ struct LaunchParams
Threads apply(Threads const& a) { return (threads = a); }
};

class LaunchContext
template<bool StoreDim3 = false>
class LaunchContextT
{
public:
#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
// If StoreDim3 is true, store a dim3 copy by value; otherwise store an unused placeholder
typename std::conditional<StoreDim3, dim3, void*>::type thread_id;
typename std::conditional<StoreDim3, dim3, void*>::type block_dim;
#endif

// Bump style allocator used to
// get memory from the pool
size_t shared_mem_offset;

void* shared_mem_ptr;

#if defined(RAJA_SYCL_ACTIVE)
// SGS ODR issue
mutable ::sycl::nd_item<3>* itm;
#endif

RAJA_HOST_DEVICE LaunchContext()
RAJA_HOST_DEVICE LaunchContextT()
: shared_mem_offset(0),
shared_mem_ptr(nullptr)
{}

#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
// Only enable this constructor if StoreDim3 is true
template<bool S = StoreDim3, typename std::enable_if<S, int>::type = 0>
RAJA_HOST_DEVICE LaunchContextT(dim3 thread_id_, dim3 block_id_)
: shared_mem_offset(0),
shared_mem_ptr(nullptr),
thread_id(thread_id_),
block_dim(block_id_)
{}

// Only enable this constructor if StoreDim3 is false
template<bool S = StoreDim3, typename std::enable_if<!S, int>::type = 0>
RAJA_HOST_DEVICE LaunchContextT(dim3 thread_id_, dim3 block_id_)
: shared_mem_offset(0),
shared_mem_ptr(nullptr)
{}
#endif

// TODO handle alignment
template<typename T>
RAJA_HOST_DEVICE T* getSharedMemory(size_t bytes)
@@ -243,6 +267,9 @@ class LaunchContext
}
};

// Preserve backwards compatibility
using LaunchContext = LaunchContextT<false>;

template<typename LAUNCH_POLICY>
struct LaunchExecute;

@@ -479,6 +506,38 @@ RAJA_HOST_DEVICE RAJA_INLINE void loop(CONTEXT const& ctx,
LoopExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx, segment, body);
}

/*
template<typename POLICY_LIST, typename SEGMENT, typename BODY>
RAJA_HOST_DEVICE RAJA_INLINE void loop(LaunchContext const& ctx, SEGMENT const&
segment, BODY const& body)
{
LoopExecute<loop_policy<POLICY_LIST>, SEGMENT>::template exec<BODY>(ctx,
segment, body);
}
*/

/*
template<typename POLICY_LIST, typename SEGMENT, typename BODY>
RAJA_HOST_DEVICE RAJA_INLINE void loop(LaunchContextT<true> const& ctx, SEGMENT
const& segment, BODY const& body)
{
LoopExecute<loop_policy<POLICY_LIST>, SEGMENT>::template exec<BODY>(ctx,
segment, body);
}
*/


/*
// Overload for other contexts
template<typename POLICY_LIST, typename CONTEXT, typename SEGMENT, typename
BODY> std::enable_if_t<!is_launch_context<CONTEXT>::value> loop(CONTEXT const&
ctx, SEGMENT const& segment, BODY const& body)
{
LoopExecute<loop_policy<POLICY_LIST>, SEGMENT>::template exec<BODY>(ctx,
segment, body);
}
*/

template<typename POLICY_LIST,
typename CONTEXT,
typename SEGMENT,
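For readers less familiar with the pattern used in the LaunchContextT changes above, here is a minimal standalone sketch (not RAJA code) of the std::conditional member plus enable_if constructor technique. Dim3 is a hypothetical stand-in for CUDA's dim3 so the example compiles on the host; ContextT is illustrative only.

#include <cstdio>
#include <type_traits>

struct Dim3 { unsigned x, y, z; };

template<bool StoreDim3 = false>
struct ContextT
{
  // A real Dim3 member when StoreDim3 is true; otherwise an unused placeholder.
  typename std::conditional<StoreDim3, Dim3, void*>::type thread_id;

  ContextT() : thread_id() {}

  // This constructor participates in overload resolution only when StoreDim3 is true.
  template<bool S = StoreDim3, typename std::enable_if<S, int>::type = 0>
  ContextT(Dim3 tid) : thread_id(tid) {}
};

int main()
{
  ContextT<true> with_dims(Dim3{1, 2, 3});  // keeps the copy
  ContextT<> without_dims;                  // placeholder only
  std::printf("%u %u\n", with_dims.thread_id.x, with_dims.thread_id.z);
  (void)without_dims;
  return 0;
}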
43 changes: 41 additions & 2 deletions include/RAJA/policy/cuda/launch.hpp
@@ -33,7 +33,7 @@ __global__ void launch_new_reduce_global_fcn(const RAJA_CUDA_GRID_CONSTANT BODY
body_in,
ReduceParams reduce_params)
{
LaunchContext ctx;
LaunchContext ctx(threadIdx, blockDim);

using RAJA::internal::thread_privatize;
auto privatizer = thread_privatize(body_in);
@@ -143,7 +143,7 @@ __launch_bounds__(num_threads, BLOCKS_PER_SM) __global__
body_in,
ReduceParams reduce_params)
{
LaunchContext ctx;
LaunchContext ctx(threadIdx, blockDim);

using RAJA::internal::thread_privatize;
auto privatizer = thread_privatize(body_in);
@@ -245,6 +245,45 @@ struct LaunchExecute<
}
};

/*
  Loop methods which rely on a copy of threadIdx/blockDim
  for performance. In collaboration with AMD we found this
  to be more performant.
*/

namespace expt
{

template<named_dim DIM>
struct cuda_ctx_thread_loop;

using cuda_ctx_thread_loop_x = cuda_ctx_thread_loop<named_dim::x>;
using cuda_ctx_thread_loop_y = cuda_ctx_thread_loop<named_dim::y>;
using cuda_ctx_thread_loop_z = cuda_ctx_thread_loop<named_dim::z>;

} // namespace expt

template<typename SEGMENT, named_dim DIM>
struct LoopExecute<expt::cuda_ctx_thread_loop<DIM>, SEGMENT>
{

template<typename BODY>
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
SEGMENT const& segment,
BODY const& body)
{

const int len = segment.end() - segment.begin();
constexpr int int_dim = static_cast<int>(DIM);

for (int i = ::RAJA::internal::CudaDimHelper<DIM>::get(ctx.thread_id);
i < len; i += ::RAJA::internal::CudaDimHelper<DIM>::get(ctx.block_dim))
{
body(*(segment.begin() + i));
}
}
};

Review thread on the loop above:

Member: If constexpr to get the values based on StoreDim3?

Member (Author): Can you share an example?

Member: See line 165 above: if constexpr (LaunchContextT<LaunchContextPolicy>::hasDim3)

Contributor: I wanted to echo that I think this is a good idea.

Contributor: Not sure if possible though, because ctx is a function parameter.

Member: We were able to get the argument type if there is one operator() that is not templated and has at least one argument. If that isn't true then it will use the default.
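Below is a hedged sketch of the "if constexpr" alternative raised in the thread above; it is not what the PR currently implements. It is written as a drop-in replacement for the exec() member above (SEGMENT and DIM come from the enclosing specialization), and it assumes LaunchContextT exposes a static constexpr flag (called hasDim3 here, following the review comment) and that CudaDimHelper::get also accepts the built-in threadIdx/blockDim variables.

template<typename CTX, typename BODY>
static RAJA_INLINE RAJA_DEVICE void exec(CTX const& ctx,
                                         SEGMENT const& segment,
                                         BODY const& body)
{
  const int len = segment.end() - segment.begin();

  int tid, bdim;
  if constexpr (CTX::hasDim3)
  {
    // Context carries dim3 copies: read the cached values.
    tid  = ::RAJA::internal::CudaDimHelper<DIM>::get(ctx.thread_id);
    bdim = ::RAJA::internal::CudaDimHelper<DIM>::get(ctx.block_dim);
  }
  else
  {
    // No cached copies: fall back to the CUDA built-ins.
    tid  = ::RAJA::internal::CudaDimHelper<DIM>::get(threadIdx);
    bdim = ::RAJA::internal::CudaDimHelper<DIM>::get(blockDim);
  }

  for (int i = tid; i < len; i += bdim)
  {
    body(*(segment.begin() + i));
  }
}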

/*
CUDA generic loop implementations
*/
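The last comment in the review thread describes deducing the lambda's context parameter type. A hedged, simplified sketch of one way that detection could work; the helper names first_arg and launch_context_for are illustrative and not part of this PR, and mutable lambdas or zero-argument operator() are not handled here.

#include <type_traits>

// Extract the first parameter type of a non-templated, non-overloaded operator().
template<typename T>
struct first_arg;

template<typename Ret, typename C, typename Arg0, typename... Rest>
struct first_arg<Ret (C::*)(Arg0, Rest...) const>
{
  using type = typename std::decay<Arg0>::type;
};

// Default: operator() is templated or overloaded, so fall back to the plain context.
template<typename BODY, typename = void>
struct launch_context_for
{
  using type = LaunchContextT<false>;
};

// If &BODY::operator() is well-formed, use whatever context type the lambda asks for.
template<typename BODY>
struct launch_context_for<BODY, std::void_t<decltype(&BODY::operator())>>
{
  using type = typename first_arg<decltype(&BODY::operator())>::type;
};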
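Finally, a hedged usage sketch of how user code might opt into the cached-dim3 path with the new loop policy. The launch API shape (RAJA::launch, LaunchParams, Teams/Threads, RAJA::loop, RangeSegment) follows existing RAJA launch usage; the exact namespace of cuda_ctx_thread_loop_x and the rule that naming LaunchContextT<true> as the lambda parameter selects the dim3-storing context are assumptions based on this PR and the review discussion. Variables a, x, y, N, n_blocks, and block_sz are placeholders.

// DAXPY-style kernel using the context-cached thread loop (hypothetical usage).
RAJA::launch<RAJA::cuda_launch_t<false>>(
    RAJA::LaunchParams(RAJA::Teams(n_blocks), RAJA::Threads(block_sz)),
    [=] RAJA_HOST_DEVICE(RAJA::LaunchContextT<true> ctx) {
      // Asking for LaunchContextT<true> requests the threadIdx/blockDim copies.
      RAJA::loop<RAJA::expt::cuda_ctx_thread_loop_x>(
          ctx, RAJA::RangeSegment(0, N), [&](int i) { y[i] += a * x[i]; });
    });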