36 commits
cd5065e
initial commit for launch loop optimization
artv3 Nov 25, 2025
484ff1a
add structs to store gpu thread/block info in launch ctx
artv3 Nov 25, 2025
18f332b
add cuda variant and add build guards for cpu
artv3 Dec 2, 2025
21f6184
Merge branch 'develop' into artv3/launch-loop-opt
artv3 Dec 2, 2025
73f224a
rework to support dim3 copy in ctx
artv3 Dec 11, 2025
8a02fee
Merge branch 'artv3/launch-loop-opt' of https://github.com/LLNL/RAJA …
artv3 Dec 11, 2025
1fbe50b
minor clean up pass
artv3 Dec 11, 2025
672889e
make format
artv3 Dec 11, 2025
5908a20
Update include/RAJA/pattern/launch/launch_core.hpp
artv3 Dec 11, 2025
316e019
Merge branch 'develop' into artv3/launch-loop-opt
rhornung67 Dec 15, 2025
4d9f800
clean up pass
artv3 Dec 18, 2025
d9ce271
update with develop and fix merge conflicts
artv3 Dec 18, 2025
85aef5a
fix build error
artv3 Dec 18, 2025
0469302
take develop submodule
artv3 Dec 18, 2025
4a695f2
cuda backend
artv3 Dec 18, 2025
f91a498
make style
artv3 Dec 18, 2025
d21c41f
omp backend
artv3 Dec 18, 2025
40a5c1b
seq backend + make style
artv3 Dec 18, 2025
e0f4825
clean up pass
artv3 Dec 18, 2025
96e99d5
Update include/RAJA/pattern/launch/launch_context_policy.hpp
artv3 Dec 18, 2025
a9f0cca
minor clean up
artv3 Dec 18, 2025
7d4595b
minor clean up
artv3 Dec 18, 2025
c23f76f
Merge branch 'artv3/launch-loop-opt' of github.com:LLNL/RAJA into art…
artv3 Dec 18, 2025
c990a4f
revert changes to example
artv3 Dec 18, 2025
f7939fd
remove specialization from launch policy
artv3 Dec 18, 2025
c24331c
make work for function pointers
artv3 Dec 18, 2025
0518138
store dim3 based on launch context type - hip
artv3 Dec 19, 2025
d5da29a
rework omp backend
artv3 Dec 19, 2025
af88dbb
update sequential backend
artv3 Dec 19, 2025
21ad0a8
get things building for cuda -- need a good clean up pass
artv3 Dec 19, 2025
646a95b
cuda clean up pass
artv3 Dec 19, 2025
597641b
clean up ordering in hip launch
artv3 Dec 19, 2025
5403737
clean up ordering
artv3 Dec 19, 2025
e41e970
make style
artv3 Dec 19, 2025
7c95430
use constexpr for getting dim values
artv3 Dec 19, 2025
bfe72de
merge develop, fix conflict
artv3 Jan 19, 2026
17 changes: 15 additions & 2 deletions include/RAJA/pattern/launch/launch_core.hpp
@@ -185,15 +185,28 @@ class LaunchContext

   void* shared_mem_ptr;

+  const size_t thread_id[3];
+  const size_t block_dim[3];
+
 #if defined(RAJA_ENABLE_SYCL)
   mutable ::sycl::nd_item<3>* itm;
 #endif

-  RAJA_HOST_DEVICE LaunchContext()
-      : shared_mem_offset(0),
-        shared_mem_ptr(nullptr)
+  RAJA_HOST_DEVICE LaunchContext()
+      : shared_mem_offset(0),
+        shared_mem_ptr(nullptr),
+        thread_id{1, 1, 1},
+        block_dim{1, 1, 1}
   {}
+
+  RAJA_HOST_DEVICE LaunchContext(const size_t tx, const size_t ty, const size_t tz,
+                                 const size_t bx, const size_t by, const size_t bz)
+      : shared_mem_offset(0),
+        shared_mem_ptr(nullptr),
+        thread_id{tx, ty, tz},
+        block_dim{bx, by, bz}
+  {}

   // TODO handle alignment
   template<typename T>
   RAJA_HOST_DEVICE T* getSharedMemory(size_t bytes)
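Taken together, the hunk leaves LaunchContext with roughly the following shape. This is a condensed sketch for orientation only, not the full RAJA header (RAJA_HOST_DEVICE qualifiers, shared-memory helpers, and the SYCL member are elided):

```cpp
#include <cstddef>

// Condensed sketch of the updated class (not the full RAJA header).
// The two new arrays cache the launching thread's index and the block
// dimensions so loop policies can read them from the context instead
// of querying threadIdx/blockDim directly.
class LaunchContext
{
public:
  size_t shared_mem_offset;
  void* shared_mem_ptr;

  const size_t thread_id[3];
  const size_t block_dim[3];

  // Default (host-side) construction: degenerate 1x1x1 extents.
  LaunchContext()
      : shared_mem_offset(0), shared_mem_ptr(nullptr),
        thread_id{1, 1, 1}, block_dim{1, 1, 1}
  {}

  // Device-side construction from the kernel's thread/block indices.
  LaunchContext(size_t tx, size_t ty, size_t tz,
                size_t bx, size_t by, size_t bz)
      : shared_mem_offset(0), shared_mem_ptr(nullptr),
        thread_id{tx, ty, tz}, block_dim{bx, by, bz}
  {}
};
```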
40 changes: 38 additions & 2 deletions include/RAJA/policy/hip/launch.hpp
@@ -32,7 +32,8 @@ template<typename BODY, typename ReduceParams>
 __global__ void launch_new_reduce_global_fcn(const BODY body_in,
                                              ReduceParams reduce_params)
 {
-  LaunchContext ctx;
+  LaunchContext ctx(threadIdx.x, threadIdx.y, threadIdx.z,
+                    blockDim.x, blockDim.y, blockDim.z);

   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
@@ -137,7 +138,8 @@ __launch_bounds__(num_threads, 1) __global__
     void launch_new_reduce_global_fcn_fixed(const BODY body_in,
                                             ReduceParams reduce_params)
 {
-  LaunchContext ctx;
+  LaunchContext ctx(threadIdx.x, threadIdx.y, threadIdx.z,
+                    blockDim.x, blockDim.y, blockDim.z);

   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
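Both reduce kernels now seed the context the same way. The pattern, reduced to its essentials (a sketch: launch_kernel and body are illustrative names, not RAJA API):

```cpp
// Sketch of the seeding pattern shared by both kernels above.
// launch_kernel / body are illustrative names, not part of RAJA.
template<typename BODY>
__global__ void launch_kernel(const BODY body)
{
  // Each thread captures its own indices and the block extents once;
  // loop policies later read ctx.thread_id / ctx.block_dim instead of
  // touching threadIdx / blockDim themselves.
  LaunchContext ctx(threadIdx.x, threadIdx.y, threadIdx.z,
                    blockDim.x, blockDim.y, blockDim.z);
  body(ctx);
}
```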
@@ -239,6 +241,40 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, nthreads>>
   }
 };

+template<named_dim DIM>
+struct hip_ctx_thread_loop;
+
+using hip_ctx_thread_loop_x = hip_ctx_thread_loop<named_dim::x>;
+using hip_ctx_thread_loop_y = hip_ctx_thread_loop<named_dim::y>;
+using hip_ctx_thread_loop_z = hip_ctx_thread_loop<named_dim::z>;
+
+template<typename SEGMENT, named_dim DIM>
+struct LoopExecute<hip_ctx_thread_loop<DIM>, SEGMENT>
+{
+  template<typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment,
+                                           BODY const& body)
+  {
+    const int len = segment.end() - segment.begin();
+    constexpr int int_dim = static_cast<int>(DIM);
+
+    // Block-stride loop: start at this thread's index in dimension DIM
+    // and advance by the block extent stored in the launch context.
+    for (int i = ctx.thread_id[int_dim]; i < len;
+         i += ctx.block_dim[int_dim])
+    {
+      body(*(segment.begin() + i));
+    }
+  }
+};
+
 /*
    HIP generic loop implementations
 */
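A usage sketch, assuming the new policy slots into RAJA::loop the same way existing policies such as hip_thread_x_loop do (the unqualified policy name, and the DAXPY body with N, a, x, y, n_blocks, block_size, are assumptions for illustration):

```cpp
using launch_policy = RAJA::LaunchPolicy<RAJA::hip_launch_t<false>>;

// Hedged usage sketch: drive a DAXPY with the context-backed loop
// policy, mirroring existing RAJA::launch + RAJA::loop usage.
RAJA::launch<launch_policy>(
    RAJA::LaunchParams(RAJA::Teams(n_blocks), RAJA::Threads(block_size)),
    [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
      RAJA::loop<hip_ctx_thread_loop_x>(
          ctx, RAJA::RangeSegment(0, N), [&](int i) {
            y[i] += a * x[i];  // block-stride traversal of [0, N)
          });
    });
```

The design point of the PR: by copying threadIdx/blockDim into the context at kernel entry, the loop implementation reads plain context members, which also lets the same LoopExecute shape work when the body is a function pointer rather than a lambda.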