diff --git a/examples/4D_tensor_contraction.cpp b/examples/4D_tensor_contraction.cpp
new file mode 100644
index 0000000000..51ffea53a6
--- /dev/null
+++ b/examples/4D_tensor_contraction.cpp
@@ -0,0 +1,191 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-25, Lawrence Livermore National Security, LLC
+// and RAJA project contributors. See the RAJA/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+
+#include "RAJA/RAJA.hpp"
+#include "camp/resource.hpp"
+#include "memoryManager.hpp"
+
+
+/*
+ * RAJA Launch Example: Upper Triangular Pattern + Shared Memory
+ *
+ * Launch introduces hierarchical parallelism through the concept of
+ * teams and threads.  Computation is executed in a pre-defined grid
+ * composed of threads and grouped into teams. The teams model enables
+ * developers to express parallelism through loops over teams, and inner loops
+ * over threads. Team loops are executed in parallel and
+ * threads within a team should be treated as sub-parallel regions.
+ *
+ * Team shared memory is allocated between team and thread loops.
+ * Memory allocated within thread loops are thread private.
+ * The example below demonstrates composing an upper triangular
+ * loop pattern, and using shared memory.
+ *
+ */
+
+// Define problem setup
+constexpr int TotalMats = 100;
+
+constexpr int I = 2;
+constexpr int J = 2;
+constexpr int L = 2;
+constexpr int K = 2;
+constexpr int M = 2;
+constexpr int N = 2;
+constexpr int O = 2;
+
+/*
+ * Define host/device launch policies
+ */
+const bool async = false;
+using launch_policy = RAJA::LaunchPolicy<RAJA::seq_launch_t
+#if defined(RAJA_ENABLE_HIP)
+                                         ,RAJA::hip_launch_t<async>
+#endif
+                                         >;
+
+using teams = RAJA::LoopPolicy<RAJA::seq_exec
+#if defined(RAJA_ENABLE_HIP)
+                               ,RAJA::hip_block_x_direct
+#endif
+                               >;
+
+using loop_0 = RAJA::LoopPolicy<RAJA::seq_exec
+#if defined(RAJA_ENABLE_HIP)
+                                ,RAJA::hip_loop_dim_exec<0>
+#endif
+                                >;
+using loop_1 = RAJA::LoopPolicy<RAJA::seq_exec
+#if defined(RAJA_ENABLE_HIP)
+                                ,RAJA::hip_loop_dim_exec<1>
+#endif
+                                >;
+using loop_2 = RAJA::LoopPolicy<RAJA::seq_exec
+#if defined(RAJA_ENABLE_HIP)
+                                ,RAJA::hip_loop_dim_exec<2>
+#endif
+                                >;
+using loop_3 = RAJA::LoopPolicy<RAJA::seq_exec
+#if defined(RAJA_ENABLE_HIP)
+                                ,RAJA::hip_loop_dim_exec<3>
+#endif
+                                >;
+
+using loop_4 = RAJA::LoopPolicy<RAJA::seq_exec
+#if defined(RAJA_ENABLE_HIP)
+                                ,RAJA::hip_loop_dim_exec<4>
+#endif
+                                >;
+using loop_5 = RAJA::LoopPolicy<RAJA::seq_exec
+#if defined(RAJA_ENABLE_HIP)
+                                ,RAJA::hip_loop_dim_exec<5>
+#endif
+                                >;
+
+
+template<typename AVIEW, typename BVIEW, typename CVIEW>
+void tensor_contraction(AVIEW A, BVIEW B, CVIEW C, RAJA::ExecPlace platform)
+{
+
+  RAJA::launch<launch_policy>
+    (platform, RAJA::LaunchParams(RAJA::Teams(TotalMats), RAJA::Threads<6>(I, J, K, M, N, O)),
+       [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
+
+         RAJA::loop<teams>(ctx, RAJA::RangeSegment(0, TotalMats), [&](int r) {
+
+           RAJA::loop<loop_5>(ctx, RAJA::RangeSegment(0, I), [&](int i) {
+             RAJA::loop<loop_4>(ctx, RAJA::RangeSegment(0, J), [&](int j) {
+               RAJA::loop<loop_3>(ctx, RAJA::RangeSegment(0, K), [&](int k) {
+                 RAJA::loop<loop_2>(ctx, RAJA::RangeSegment(0, M), [&](int m) {
+                   RAJA::loop<loop_1>(ctx, RAJA::RangeSegment(0, N), [&](int n) {
+                     RAJA::loop<loop_0>(ctx, RAJA::RangeSegment(0, O), [&](int o) {
+
+                       double dot = 0.0;
+                       for(int l = 0; l < L; ++l) {
+                         dot += A(r, i,j,k,l) * B(r, l,m,n,o);
+                       }
+                       C(r, i,j,k,m,n,o) = dot;
+
+                      });
+                    });
+                  });
+                });
+              });
+            });
+          });
+
+       });  // outer lambda
+
+
+}
+
+
+
+int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
+{
+
+
+  double *Aptr = memoryManager::allocate<double>(TotalMats * I * J * K * L);
+  double *Bptr = memoryManager::allocate<double>(TotalMats * L * M * N * O);
+  double *Cptr = memoryManager::allocate<double>(TotalMats * I * J * K * M * N * O);
+
+  double *test_Cptr = memoryManager::allocate<double>(TotalMats * I * J * K * M * N * O);
+
+  auto A = RAJA::make_permuted_view<RAJA::layout_right>(Aptr, TotalMats, I, J, K, L);
+  auto B = RAJA::make_permuted_view<RAJA::layout_right>(Bptr, TotalMats, L, M, N, O);
+  auto C = RAJA::make_permuted_view<RAJA::layout_right>(Cptr, TotalMats, I, J, K, M, N, O);
+  auto test_C = RAJA::make_permuted_view<RAJA::layout_right>(test_Cptr, TotalMats, I, J, K, M, N, O);
+
+  // Initialize A and B with some values
+  for(int mat = 0; mat < TotalMats; ++mat) {
+
+    for (int i = 0; i < I; i++) {
+      for (int j = 0; j < J; j++) {
+	for (int k = 0; k < K; k++) {
+	  for (int l = 0; l < L; l++) {
+	    A(mat, i, j, k, l) = 1.0;
+	  }
+	}
+      }
+    }
+
+    for (int l = 0; l < L; l++) {
+      for (int m = 0; m < M; m++) {
+	for (int n = 0; n < N; n++) {
+	  for (int o = 0; o < O; o++) {
+	    B(mat, l, m, n, o) = 1.0;
+	  }
+	}
+      }
+    }
+
+  }
+
+
+
+  tensor_contraction(A, B, C, RAJA::ExecPlace::HOST);
+
+  tensor_contraction(A, B, test_C, RAJA::ExecPlace::DEVICE);
+
+
+  //test correctness
+  double diff = 0.0;
+  for(int i = 0; i < I * J * K * M * N * O; ++i) {
+    diff += fabs(Cptr[i] - test_Cptr[i]);
+  }
+
+  std::cout<<"diff = "<<diff<<std::endl;
+
+
+
+
+}  // Main
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 654f7a45ae..0c4dcd20ca 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -67,6 +67,14 @@ raja_add_executable(
   NAME pi-reduce_vs_atomic
   SOURCES pi-reduce_vs_atomic.cpp)
 
+raja_add_executable(
+  NAME launch-high-dim-threads
+  SOURCES launch-high-dim-threads.cpp)
+
+raja_add_executable(
+  NAME 4D_tensor_contraction
+  SOURCES 4D_tensor_contraction.cpp)
+
 raja_add_executable(
   NAME raja-launch
   SOURCES raja-launch.cpp)
diff --git a/examples/launch-high-dim-threads.cpp b/examples/launch-high-dim-threads.cpp
new file mode 100644
index 0000000000..a3c41c51ff
--- /dev/null
+++ b/examples/launch-high-dim-threads.cpp
@@ -0,0 +1,196 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-25, Lawrence Livermore National Security, LLC
+// and RAJA project contributors. See the RAJA/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+
+#include "RAJA/RAJA.hpp"
+#include "camp/resource.hpp"
+
+
+/*
+ * RAJA Launch Example: Upper Triangular Pattern + Shared Memory
+ *
+ * Launch introduces hierarchical parallelism through the concept of
+ * teams and threads.  Computation is executed in a pre-defined grid
+ * composed of threads and grouped into teams. The teams model enables
+ * developers to express parallelism through loops over teams, and inner loops
+ * over threads. Team loops are executed in parallel and
+ * threads within a team should be treated as sub-parallel regions.
+ *
+ * Team shared memory is allocated between team and thread loops.
+ * Memory allocated within thread loops are thread private.
+ * The example below demonstrates composing an upper triangular
+ * loop pattern, and using shared memory.
+ *
+ */
+
+/*
+ * Define host/device launch policies
+ */
+using launch_policy = RAJA::LaunchPolicy<
+#if defined(RAJA_ENABLE_OPENMP)
+    RAJA::omp_launch_t
+#else
+    RAJA::seq_launch_t
+#endif
+#if defined(RAJA_ENABLE_CUDA)
+    ,
+    RAJA::cuda_launch_t<false>
+#endif
+#if defined(RAJA_ENABLE_HIP)
+    ,
+    RAJA::hip_launch_t<false>
+#endif
+    >;
+
+/*
+ * Define team policies.
+ * Up to 3 dimension are supported: x,y,z
+ */
+using teams_x = RAJA::LoopPolicy<
+#if defined(RAJA_ENABLE_OPENMP)
+                                       RAJA::omp_parallel_for_exec
+#else
+                                       RAJA::seq_exec
+#endif
+#if defined(RAJA_ENABLE_CUDA)
+                                       ,
+                                       RAJA::cuda_block_x_direct
+#endif
+#if defined(RAJA_ENABLE_HIP)
+                                       ,
+                                       RAJA::hip_block_x_direct
+#endif
+                                       >;
+/*
+ * Define thread policies.
+ * Up to 3 dimension are supported: x,y,z
+ */
+using threads_x = RAJA::LoopPolicy<RAJA::seq_exec
+#if defined(RAJA_ENABLE_CUDA)
+                                         ,
+                                         RAJA::cuda_thread_x_loop
+#endif
+#if defined(RAJA_ENABLE_HIP)
+                                         ,
+                                         RAJA::hip_thread_x_loop
+#endif
+                                         >;
+
+
+int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
+{
+
+  // Resource object for host
+  camp::resources::Host host_res;
+
+  // Resource objects for CUDA or HIP
+#if defined(RAJA_ENABLE_CUDA)
+  camp::resources::Cuda device_res;
+#endif
+
+#if defined(RAJA_ENABLE_HIP)
+  camp::resources::Hip device_res;
+#endif
+
+  std::cout << "\n Running RAJA-Launch examples...\n";
+  int num_of_backends = 1;
+#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
+  num_of_backends++;
+#endif
+
+  // RAJA teams may switch between host and device policies at run time.
+  // The loop below will execute through the available backends.
+
+  for (int exec_place = 0; exec_place < num_of_backends; ++exec_place) {
+
+    auto select_cpu_or_gpu = (RAJA::ExecPlace)exec_place;
+
+    // Allocate memory for either host or device
+    int N_tri = 5;
+
+    int* Ddat = nullptr;
+    if (select_cpu_or_gpu == RAJA::ExecPlace::HOST) {
+      Ddat = host_res.allocate<int>(N_tri * N_tri);
+    }
+
+#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
+    if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) {
+      Ddat = device_res.allocate<int>(N_tri * N_tri);
+    }
+#endif
+
+    /*
+     * RAJA::launch just starts a "kernel" and doesn't provide any looping.
+     *
+     * The first argument determines which policy should be executed,
+     *
+     * The second argument is the number of teams+threads needed for each of the
+     * policies.
+     *
+     * Third argument is the lambda.
+     *
+     * The lambda takes a "resource" object, which has the teams+threads
+     * and is used to perform thread synchronizations within a team.
+     */
+
+    if (select_cpu_or_gpu == RAJA::ExecPlace::HOST){
+      std::cout << "\n Running upper triangular pattern example on the host...\n";
+    } else {
+      std::cout << "\n Running upper triangular pattern example on the device...\n";
+    }
+
+    RAJA::LaunchParams{};
+    
+    //Example of high-dimensional thread-team
+    RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads<4>(1,2,3,4));
+    
+    RAJA::View<int, RAJA::Layout<2>> D(Ddat, N_tri, N_tri);
+
+    RAJA::launch<launch_policy>
+      (select_cpu_or_gpu,
+       RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads<4>(1,2,3,4)),
+       //RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads(N_tri)),
+       [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
+         printf("in kernel \n");
+         RAJA::loop<teams_x>(ctx, RAJA::RangeSegment(0, N_tri), [&](int r) {
+
+           // Array shared within threads of the same team
+           RAJA_TEAM_SHARED int s_A[1];
+
+           RAJA::loop<threads_x>(ctx, RAJA::RangeSegment(0, 1), [&](int c) {
+              s_A[c] = r;
+           });  // loop c
+
+           ctx.teamSync();
+
+           RAJA::loop<threads_x>(ctx, RAJA::RangeSegment(r, N_tri), [&](int c) {
+               D(r, c) = r * N_tri + c;
+               printf("r=%d, c=%d : D=%d : s_A = %d \n", r, c, D(r, c), s_A[0]);
+           });  // loop c
+
+         });  // loop r
+
+       });  // outer lambda
+
+    if (select_cpu_or_gpu == RAJA::ExecPlace::HOST) {
+      host_res.deallocate(Ddat);
+    }
+
+#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
+    if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) {
+      device_res.deallocate(Ddat);
+    }
+#endif
+
+  }  // Execution places loop
+
+
+}  // Main
diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp
index bfce94057c..8a580fb801 100644
--- a/include/RAJA/pattern/launch/launch_core.hpp
+++ b/include/RAJA/pattern/launch/launch_core.hpp
@@ -107,10 +107,13 @@ struct Teams
   constexpr Teams(int i, int j, int k) : value {i, j, k} {}
 };
 
+template<size_t DIM=3>
 struct Threads
 {
-  int value[3];
 
+  std::array<int, DIM> value;
+  // int value[DIM];
+#if 1
   RAJA_INLINE
 
   RAJA_HOST_DEVICE
@@ -130,35 +133,26 @@ struct Threads
 
   RAJA_HOST_DEVICE
   constexpr Threads(int i, int j, int k) : value {i, j, k} {}
-};
-
-struct Lanes
-{
-  int value;
-
-  RAJA_INLINE
-
-  RAJA_HOST_DEVICE
-  constexpr Lanes() : value(0) {}
 
-  RAJA_INLINE
-
-  RAJA_HOST_DEVICE
-  constexpr Lanes(int i) : value(i) {}
+  //#else
+  template<typename... Args>
+  constexpr Threads(Args... args) : value {static_cast<int>(args)...} {};
+#endif
 };
 
+template<size_t ThreadDIM = 3>
 struct LaunchParams
 {
 public:
   Teams teams;
-  Threads threads;
+  Threads<ThreadDIM> threads;
   size_t shared_mem_size;
 
   RAJA_INLINE
   LaunchParams() = default;
 
   LaunchParams(Teams in_teams,
-               Threads in_threads,
+               Threads<ThreadDIM> in_threads,
                size_t in_shared_mem_size = 0)
       : teams(in_teams),
         threads(in_threads),
@@ -170,10 +164,12 @@ struct LaunchParams
   RAJA_INLINE
   Teams apply(Teams const& a) { return (teams = a); }
 
+  /*
   RAJA_HOST_DEVICE
 
   RAJA_INLINE
   Threads apply(Threads const& a) { return (threads = a); }
+  */
 };
 
 class LaunchContext
@@ -185,6 +181,10 @@ class LaunchContext
 
   void* shared_mem_ptr;
 
+  //hardcoded for now...
+  std::array<int, 6> thread_dim;
+  std::array<int, 6> thread_id;
+
 #if defined(RAJA_ENABLE_SYCL)
   mutable ::sycl::nd_item<3>* itm;
 #endif
@@ -246,8 +246,8 @@ struct LaunchExecute;
 
 // Duplicate of code above on account that we need to support the case in which
 // a kernel_name is not given
-template<typename LAUNCH_POLICY, typename... ReduceParams>
-void launch(LaunchParams const& launch_params,
+template<typename LAUNCH_POLICY, size_t ThreadDIM = 3, typename... ReduceParams>
+void launch(LaunchParams<ThreadDIM> const& launch_params,
             ReduceParams&&... rest_of_launch_args)
 {
   // Get reducers
@@ -288,17 +288,19 @@ void launch(LaunchParams const& launch_params,
 //=================================================
 // Run time based policy launch
 //=================================================
-template<typename POLICY_LIST, typename BODY>
-void launch(ExecPlace place, LaunchParams const& params, BODY const& body)
+template<typename POLICY_LIST, size_t ThreadDIM = 3, typename BODY>
+void launch(ExecPlace place,
+            LaunchParams<ThreadDIM> const& params,
+            BODY const& body)
 {
   launch<POLICY_LIST>(place, params, body);
 }
 
 // Run-time API for new reducer interface with support of the case without a new
 // kernel name
-template<typename POLICY_LIST, typename... ReduceParams>
+template<typename POLICY_LIST, size_t ThreadDIM = 3, typename... ReduceParams>
 void launch(ExecPlace place,
-            const LaunchParams& launch_params,
+            LaunchParams<ThreadDIM> const& launch_params,
             ReduceParams&&... rest_of_launch_args)
 // BODY const &body)
 {
@@ -367,10 +369,10 @@ RAJA::resources::Resource Get_Host_Resource(T host_res, RAJA::ExecPlace device)
 
 // Duplicate of API above on account that we need to handle the case that a
 // kernel name is not provided
-template<typename POLICY_LIST, typename... ReduceParams>
+template<typename POLICY_LIST, size_t ThreadDIM = 3, typename... ReduceParams>
 resources::EventProxy<resources::Resource> launch(
     RAJA::resources::Resource res,
-    LaunchParams const& launch_params,
+    LaunchParams<ThreadDIM> const& launch_params,
     ReduceParams&&... rest_of_launch_args)
 {
 
diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp
index f0b7d0be98..6a54205ded 100644
--- a/include/RAJA/policy/cuda/launch.hpp
+++ b/include/RAJA/policy/cuda/launch.hpp
@@ -72,13 +72,13 @@ struct LaunchExecute<
                                                named_usage::unspecified>>
 {
 
-  template<typename BODY_IN, typename ReduceParams>
+  template<size_t ThreadDIM = 3, typename BODY_IN, typename ReduceParams>
   static concepts::enable_if_t<
       resources::EventProxy<resources::Resource>,
       RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
       RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
   exec(RAJA::resources::Resource res,
-       const LaunchParams& params,
+       const LaunchParams<ThreadDIM>& params,
        BODY_IN&& body_in,
        ReduceParams& RAJA_UNUSED_ARG(launch_reducers))
   {
@@ -88,6 +88,14 @@ struct LaunchExecute<
 
     resources::Cuda cuda_res = res.get<RAJA::resources::Cuda>();
 
+
+    if (params.threads.value.size() > 3)
+    {
+      std::cout << "threads container is larger than 3 : "
+                << params.threads.value.size() << std::endl;
+    }
+
+
     //
     // Compute the number of blocks and threads
     //
@@ -96,10 +104,32 @@ struct LaunchExecute<
                          static_cast<cuda_dim_member_t>(params.teams.value[1]),
                          static_cast<cuda_dim_member_t>(params.teams.value[2])};
 
+    cuda_dim_t blockSize;
+
+    if (params.threads.value.size() < 4)
+    {
+      blockSize =
+          cuda_dim_t {static_cast<cuda_dim_member_t>(params.threads.value[0]),
+                      static_cast<cuda_dim_member_t>(params.threads.value[1]),
+                      static_cast<cuda_dim_member_t>(params.threads.value[2])};
+    }
+    else
+    {
+
+      int total_threads = detail::multiplyArray(params.threads.value);
+      std::cout << "Total threads" << std::endl;
+      blockSize = cuda_dim_t {static_cast<cuda_dim_member_t>(
+                                  detail::multiplyArray(params.threads.value)),
+                              static_cast<cuda_dim_member_t>(1),
+                              static_cast<cuda_dim_member_t>(1)};
+    }
+
+    /*
     cuda_dim_t blockSize {
         static_cast<cuda_dim_member_t>(params.threads.value[0]),
         static_cast<cuda_dim_member_t>(params.threads.value[1]),
         static_cast<cuda_dim_member_t>(params.threads.value[2])};
+    */
 
     // Only launch kernel if we have something to iterate over
     constexpr cuda_dim_member_t zero = 0;
@@ -129,19 +159,23 @@ struct LaunchExecute<
 
       RAJA_FT_END;
     }
+    else
+    {
+      std::cout << "did not launch kernel " << std::endl;
+    }
 
     return resources::EventProxy<resources::Resource>(res);
   }
 
   // Version with explicit reduction parameters..
-  template<typename BODY_IN, typename ReduceParams>
+  template<size_t ThreadDIM = 3, typename BODY_IN, typename ReduceParams>
   static concepts::enable_if_t<
       resources::EventProxy<resources::Resource>,
       RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
       concepts::negate<
           RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
   exec(RAJA::resources::Resource res,
-       const LaunchParams& launch_params,
+       const LaunchParams<ThreadDIM>& launch_params,
        BODY_IN&& body_in,
        ReduceParams& launch_reducers)
   {
@@ -260,13 +294,13 @@ struct LaunchExecute<
     RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads, BLOCKS_PER_SM>>
 {
 
-  template<typename BODY_IN, typename ReduceParams>
+  template<size_t ThreadDIM = 3, typename BODY_IN, typename ReduceParams>
   static concepts::enable_if_t<
       resources::EventProxy<resources::Resource>,
       RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
       RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
   exec(RAJA::resources::Resource res,
-       const LaunchParams& params,
+       const LaunchParams<ThreadDIM>& params,
        BODY_IN&& body_in,
        ReduceParams& RAJA_UNUSED_ARG(launch_reducers))
   {
@@ -323,14 +357,14 @@ struct LaunchExecute<
   }
 
   // Version with explicit reduction parameters..
-  template<typename BODY_IN, typename ReduceParams>
+  template<size_t ThreadDIM = 3, typename BODY_IN, typename ReduceParams>
   static concepts::enable_if_t<
       resources::EventProxy<resources::Resource>,
       RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
       concepts::negate<
           RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
   exec(RAJA::resources::Resource res,
-       const LaunchParams& launch_params,
+       const LaunchParams<ThreadDIM>& launch_params,
        BODY_IN&& body_in,
        ReduceParams& launch_reducers)
   {
diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp
index ab27e9b456..32e5a6595b 100644
--- a/include/RAJA/policy/hip/launch.hpp
+++ b/include/RAJA/policy/hip/launch.hpp
@@ -28,6 +28,25 @@
 namespace RAJA
 {
 
+// internal helper function
+namespace detail
+{
+
+template<typename T, size_t... I>
+constexpr T multiply_impl(const std::array<T, sizeof...(I)>& arr,
+                          std::index_sequence<I...>)
+{
+  return (arr[I] * ...);
+}
+
+template<typename T, size_t N>
+constexpr T multiplyArray(const std::array<T, N>& arr)
+{
+  return multiply_impl(arr, std::make_index_sequence<N> {});
+}
+
+}  // namespace detail
+
 template<typename BODY>
 __global__ void launch_global_fcn(BODY body_in)
 {
@@ -44,6 +63,37 @@ __global__ void launch_global_fcn(BODY body_in)
   body(ctx);
 }
 
+template<typename BODY>
+__global__ void launch_global_fcn_ctx(BODY body_in, LaunchContext ctx)
+{
+  //LaunchContext ctx;
+
+  //unravel index
+  int tid = threadIdx.x;
+  for (int d = ctx.thread_dim.size()-1; d >= 0; --d) {
+    ctx.thread_id[d] = tid % ctx.thread_dim[d];
+    tid /= ctx.thread_dim[d];
+  }
+
+  /*
+  if(threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0){
+    for(int i=0; i<ctx.threads.size(); ++i) {
+      printf("ctx.threads[i] = %d \n", ctx.threads[i]);
+    }
+  }
+  */
+
+  using RAJA::internal::thread_privatize;
+  auto privatizer = thread_privatize(body_in);
+  auto& body      = privatizer.get_priv();
+
+  // Set pointer to shared memory
+  extern __shared__ char raja_shmem_ptr[];
+  ctx.shared_mem_ptr = raja_shmem_ptr;
+
+  body(ctx);
+}
+
 template<typename BODY, typename ReduceParams>
 __global__ void launch_new_reduce_global_fcn(BODY body_in,
                                              ReduceParams reduce_params)
@@ -70,22 +120,29 @@ struct LaunchExecute<
     RAJA::policy::hip::hip_launch_t<async, named_usage::unspecified>>
 {
 
-  template<typename BODY_IN, typename ReduceParams>
+  template<size_t ThreadDIM = 3, typename BODY_IN, typename ReduceParams>
   static concepts::enable_if_t<
       resources::EventProxy<resources::Resource>,
       RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
       RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
   exec(RAJA::resources::Resource res,
-       const LaunchParams& params,
+       const LaunchParams<ThreadDIM>& params,
        BODY_IN&& body_in,
        ReduceParams& RAJA_UNUSED_ARG(launch_reducers))
   {
     using BODY = camp::decay<BODY_IN>;
 
-    auto func = reinterpret_cast<const void*>(&launch_global_fcn<BODY>);
+    auto func = reinterpret_cast<const void*>(&launch_global_fcn_ctx<BODY>);
 
     resources::Hip hip_res = res.get<RAJA::resources::Hip>();
 
+    if (params.threads.value.size() > 3)
+    {
+      std::cout << "threads container is larger than 3 : "
+                << params.threads.value.size() << std::endl;
+    }
+
+
     //
     // Compute the number of blocks and threads
     //
@@ -94,10 +151,33 @@ struct LaunchExecute<
                         static_cast<hip_dim_member_t>(params.teams.value[1]),
                         static_cast<hip_dim_member_t>(params.teams.value[2])};
 
+    /*
     hip_dim_t blockSize {
         static_cast<hip_dim_member_t>(params.threads.value[0]),
         static_cast<hip_dim_member_t>(params.threads.value[1]),
         static_cast<hip_dim_member_t>(params.threads.value[2])};
+    */
+
+    hip_dim_t blockSize;
+
+    if (params.threads.value.size() < 4)
+    {
+      blockSize =
+          hip_dim_t {static_cast<hip_dim_member_t>(params.threads.value[0]),
+                      static_cast<hip_dim_member_t>(params.threads.value[1]),
+                      static_cast<hip_dim_member_t>(params.threads.value[2])};
+    }
+    else
+    {
+
+      int total_threads = detail::multiplyArray(params.threads.value);
+      std::cout << "Total threads" << std::endl;
+      blockSize = hip_dim_t {static_cast<hip_dim_member_t>(
+                                  detail::multiplyArray(params.threads.value)),
+                              static_cast<hip_dim_member_t>(1),
+                              static_cast<hip_dim_member_t>(1)};
+    }
+
 
     // Only launch kernel if we have something to iterate over
     constexpr hip_dim_member_t zero = 0;
@@ -109,7 +189,7 @@ struct LaunchExecute<
 
       {
         size_t shared_mem_size = params.shared_mem_size;
-
+        std::cout << "launching kernel " << std::endl;
         //
         // Privatize the loop_body, using make_launch_body to setup reductions
         //
@@ -117,29 +197,38 @@ struct LaunchExecute<
                                                 shared_mem_size, hip_res,
                                                 std::forward<BODY_IN>(body_in));
 
+        //Copy threads over
+        LaunchContext ctx;
+        ctx.thread_dim = params.threads.value;
+
         //
         // Launch the kernel
         //
-        void* args[] = {(void*)&body};
+        void* args[] = {(void*)&body, (void*)&ctx};
         RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size,
                           hip_res, async);
       }
 
       RAJA_FT_END;
     }
+    else
+    {
+
+      std::cout << "did not launch kernel " << std::endl;
+    }
 
     return resources::EventProxy<resources::Resource>(res);
   }
 
   // Version with explicit reduction parameters..
-  template<typename BODY_IN, typename ReduceParams>
+  template<size_t ThreadDIM = 3, typename BODY_IN, typename ReduceParams>
   static concepts::enable_if_t<
       resources::EventProxy<resources::Resource>,
       RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
       concepts::negate<
           RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
   exec(RAJA::resources::Resource res,
-       const LaunchParams& launch_params,
+       const LaunchParams<ThreadDIM>& launch_params,
        BODY_IN&& body_in,
        ReduceParams& launch_reducers)
   {
@@ -255,13 +344,13 @@ template<bool async, int nthreads>
 struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, nthreads>>
 {
 
-  template<typename BODY_IN, typename ReduceParams>
+  template<size_t ThreadDIM = 3, typename BODY_IN, typename ReduceParams>
   static concepts::enable_if_t<
       resources::EventProxy<resources::Resource>,
       RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
       RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
   exec(RAJA::resources::Resource res,
-       const LaunchParams& params,
+       const LaunchParams<ThreadDIM>& params,
        BODY_IN&& body_in,
        ReduceParams& RAJA_UNUSED_ARG(launch_reducers))
   {
@@ -318,14 +407,14 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, nthreads>>
   }
 
   // Version with explicit reduction parameters..
-  template<typename BODY_IN, typename ReduceParams>
+  template<size_t ThreadDIM = 3, typename BODY_IN, typename ReduceParams>
   static concepts::enable_if_t<
       resources::EventProxy<resources::Resource>,
       RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
       concepts::negate<
           RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
   exec(RAJA::resources::Resource res,
-       const LaunchParams& launch_params,
+       const LaunchParams<ThreadDIM>& launch_params,
        BODY_IN&& body_in,
        ReduceParams& launch_reducers)
   {
@@ -401,6 +490,30 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, nthreads>>
   }
 };
 
+
+/*
+  Arbitrary dimension thread indexing
+*/
+template<size_t dim>
+struct hip_loop_dim_exec;
+
+template<typename SEGMENT, size_t DIM>
+struct LoopExecute<hip_loop_dim_exec<DIM>, SEGMENT>
+{
+
+  template<typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const &ctx,
+      SEGMENT const& segment,
+      BODY const& body)
+  {
+    const int i = ctx.thread_id[DIM];
+
+    body(*(segment.begin() + i));
+  }
+};
+
+
 /*
    HIP generic loop implementations
 */
diff --git a/include/RAJA/policy/openmp/launch.hpp b/include/RAJA/policy/openmp/launch.hpp
index 2092c87bb3..a7cebc31c9 100644
--- a/include/RAJA/policy/openmp/launch.hpp
+++ b/include/RAJA/policy/openmp/launch.hpp
@@ -28,13 +28,13 @@ template<>
 struct LaunchExecute<RAJA::omp_launch_t>
 {
 
-  template<typename BODY, typename ReduceParams>
+  template<size_t ThreadDIM = 3, typename BODY, typename ReduceParams>
   static concepts::enable_if_t<
       resources::EventProxy<resources::Resource>,
       RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
       RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
   exec(RAJA::resources::Resource res,
-       LaunchParams const& params,
+       LaunchParams<ThreadDIM> const& params,
        BODY const& body,
        ReduceParams& RAJA_UNUSED_ARG(launch_reducers))
   {
@@ -55,14 +55,14 @@ struct LaunchExecute<RAJA::omp_launch_t>
     return resources::EventProxy<resources::Resource>(res);
   }
 
-  template<typename ReduceParams, typename BODY>
+  template<typename ReduceParams, typename BODY, size_t ThreadDIM = 3>
   static concepts::enable_if_t<
       resources::EventProxy<resources::Resource>,
       RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
       concepts::negate<
           RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
   exec(RAJA::resources::Resource res,
-       LaunchParams const& launch_params,
+       LaunchParams<ThreadDIM> const& launch_params,
        BODY const& body,
        ReduceParams& f_params)
   {
diff --git a/include/RAJA/policy/sequential/launch.hpp b/include/RAJA/policy/sequential/launch.hpp
index ee98804ecf..1026e78af2 100644
--- a/include/RAJA/policy/sequential/launch.hpp
+++ b/include/RAJA/policy/sequential/launch.hpp
@@ -40,13 +40,13 @@ template<>
 struct LaunchExecute<RAJA::seq_launch_t>
 {
 
-  template<typename BODY, typename ReduceParams>
+  template<typename BODY, size_t ThreadDIM = 3, typename ReduceParams>
   static concepts::enable_if_t<
       resources::EventProxy<resources::Resource>,
       RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
       RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
   exec(RAJA::resources::Resource res,
-       LaunchParams const& params,
+       LaunchParams<ThreadDIM> const& params,
        BODY const& body,
        ReduceParams& RAJA_UNUSED_ARG(ReduceParams))
   {
@@ -64,14 +64,14 @@ struct LaunchExecute<RAJA::seq_launch_t>
     return resources::EventProxy<resources::Resource>(res);
   }
 
-  template<typename BODY, typename ReduceParams>
+  template<typename BODY, size_t ThreadDIM = 3, typename ReduceParams>
   static concepts::enable_if_t<
       resources::EventProxy<resources::Resource>,
       RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
       concepts::negate<
           RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
   exec(RAJA::resources::Resource res,
-       LaunchParams const& launch_params,
+       LaunchParams<ThreadDIM> const& launch_params,
        BODY const& body,
        ReduceParams& launch_reducers)
   {
diff --git a/include/RAJA/policy/sycl/launch.hpp b/include/RAJA/policy/sycl/launch.hpp
index f69e2c4424..7dacc5105d 100644
--- a/include/RAJA/policy/sycl/launch.hpp
+++ b/include/RAJA/policy/sycl/launch.hpp
@@ -33,7 +33,8 @@ struct LaunchExecute<RAJA::sycl_launch_t<async, 0>>
 {
 
   // If the launch lambda is trivially copyable
-  template<typename BODY_IN,
+  template<size_t ThreadDIM = 3,
+           typename BODY_IN,
            typename ReduceParams,
            typename std::enable_if<std::is_trivially_copyable<BODY_IN> {},
                                    bool>::type = true>
@@ -42,7 +43,7 @@ struct LaunchExecute<RAJA::sycl_launch_t<async, 0>>
       RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
       RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
   exec(RAJA::resources::Resource res,
-       const LaunchParams& params,
+       const LaunchParams<ThreadDIM>& params,
        BODY_IN&& body_in,
        ReduceParams& RAJA_UNUSED_ARG(launch_reducers))
   {
@@ -102,7 +103,8 @@ struct LaunchExecute<RAJA::sycl_launch_t<async, 0>>
 
   // If the launch lambda is trivially copyable and we have explcit reduction
   // parameters
-  template<typename BODY_IN,
+  template<size_t ThreadDIM = 3,
+           typename BODY_IN,
            typename ReduceParams,
            typename std::enable_if<std::is_trivially_copyable<BODY_IN> {},
                                    bool>::type = true>
@@ -112,7 +114,7 @@ struct LaunchExecute<RAJA::sycl_launch_t<async, 0>>
       concepts::negate<
           RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
   exec(RAJA::resources::Resource res,
-       const LaunchParams& launch_params,
+       const LaunchParams<ThreadDIM>& launch_params,
        BODY_IN&& body_in,
        ReduceParams launch_reducers)
   {
@@ -194,7 +196,8 @@ struct LaunchExecute<RAJA::sycl_launch_t<async, 0>>
   }
 
   // If the launch lambda is not trivially copyable
-  template<typename BODY_IN,
+  template<size_t ThreadDIM = 3,
+           typename BODY_IN,
            typename ReduceParams,
            typename std::enable_if<!std::is_trivially_copyable<BODY_IN> {},
                                    bool>::type = true>
@@ -203,7 +206,7 @@ struct LaunchExecute<RAJA::sycl_launch_t<async, 0>>
       RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
       RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
   exec(RAJA::resources::Resource res,
-       const LaunchParams& params,
+       const LaunchParams<ThreadDIM>& params,
        BODY_IN&& body_in,
        ReduceParams& RAJA_UNUSED_ARG(launch_reducers))
   {
@@ -269,7 +272,8 @@ struct LaunchExecute<RAJA::sycl_launch_t<async, 0>>
   }
 
   // If the launch lambda is not trivially copyable
-  template<typename BODY_IN,
+  template<size_t ThreadDIM = 3,
+           typename BODY_IN,
            typename ReduceParams,
            typename std::enable_if<!std::is_trivially_copyable<BODY_IN> {},
                                    bool>::type = true>
@@ -279,7 +283,7 @@ struct LaunchExecute<RAJA::sycl_launch_t<async, 0>>
       concepts::negate<
           RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
   exec(RAJA::resources::Resource res,
-       const LaunchParams& launch_params,
+       const LaunchParams<ThreadDIM>& launch_params,
        BODY_IN&& body_in,
        ReduceParams launch_reducers)
   {