Skip to content
Open
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
cd5065e
initial commit for launch loop optimization
artv3 Nov 25, 2025
484ff1a
add structs to store gpu thread/block info in launch ctx
artv3 Nov 25, 2025
18f332b
add cuda variant and add build guards for cpu
artv3 Dec 2, 2025
21f6184
Merge branch 'develop' into artv3/launch-loop-opt
artv3 Dec 2, 2025
73f224a
rework to support dim3 copy in ctx
artv3 Dec 11, 2025
8a02fee
Merge branch 'artv3/launch-loop-opt' of https://github.com/LLNL/RAJA …
artv3 Dec 11, 2025
1fbe50b
minor clean up pass
artv3 Dec 11, 2025
672889e
make format
artv3 Dec 11, 2025
5908a20
Update include/RAJA/pattern/launch/launch_core.hpp
artv3 Dec 11, 2025
316e019
Merge branch 'develop' into artv3/launch-loop-opt
rhornung67 Dec 15, 2025
4d9f800
clean up pass
artv3 Dec 18, 2025
d9ce271
update with develop and fix merge conflicts
artv3 Dec 18, 2025
85aef5a
fix build error
artv3 Dec 18, 2025
0469302
take develop submodule
artv3 Dec 18, 2025
4a695f2
cuda backend
artv3 Dec 18, 2025
f91a498
make style
artv3 Dec 18, 2025
d21c41f
omp backend
artv3 Dec 18, 2025
40a5c1b
seq backend + make style
artv3 Dec 18, 2025
e0f4825
clean up pass
artv3 Dec 18, 2025
96e99d5
Update include/RAJA/pattern/launch/launch_context_policy.hpp
artv3 Dec 18, 2025
a9f0cca
minor clean up
artv3 Dec 18, 2025
7d4595b
minor clean up
artv3 Dec 18, 2025
c23f76f
Merge branch 'artv3/launch-loop-opt' of github.com:LLNL/RAJA into art…
artv3 Dec 18, 2025
c990a4f
revert changes to example
artv3 Dec 18, 2025
f7939fd
remove specialization from launch policy
artv3 Dec 18, 2025
c24331c
make work for function pointers
artv3 Dec 18, 2025
0518138
store dim3 based on launch context type - hip
artv3 Dec 19, 2025
d5da29a
rework omp backend
artv3 Dec 19, 2025
af88dbb
update sequential backend
artv3 Dec 19, 2025
21ad0a8
get things building for cuda -- need a good clean up pass
artv3 Dec 19, 2025
646a95b
cuda clean up pass
artv3 Dec 19, 2025
597641b
clean up ordering in hip launch
artv3 Dec 19, 2025
5403737
clean up ordering
artv3 Dec 19, 2025
e41e970
make style
artv3 Dec 19, 2025
7c95430
use constexpt for getting dim values
artv3 Dec 19, 2025
bfe72de
merge develop, fix conflict
artv3 Jan 19, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions include/RAJA/pattern/launch/launch_context_policy.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
/*!
 ******************************************************************************
 *
 * \file
 *
 * \brief RAJA header file declaring the launch context policy tag types
 *        used to select the RAJA::launch context specialization
 *
 ******************************************************************************
 */

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
// Copyright (c) 2016-25, Lawrence Livermore National Security, LLC
// and RAJA project contributors. See the RAJA/LICENSE file for details.
//
// SPDX-License-Identifier: (BSD-3-Clause)
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//

// Guard name matches the file path: pattern/launch/launch_context_policy.hpp
#ifndef RAJA_pattern_launch_context_policy_HPP
#define RAJA_pattern_launch_context_policy_HPP

namespace RAJA
{

// Tag type selecting the default launch context (shared-memory
// bookkeeping only; no cached launch-dimension data).
class LaunchContextDefaultPolicy;

#if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE)
// Tag type selecting a launch context that additionally carries dim3
// copies of the launch dimensions for the CUDA/HIP backends.
class LaunchContextDim3Policy;
#endif

}  // namespace RAJA
#endif
57 changes: 40 additions & 17 deletions include/RAJA/pattern/launch/launch_core.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

#include "RAJA/config.hpp"
#include "RAJA/internal/get_platform.hpp"
#include "RAJA/pattern/launch/launch_context_policy.hpp"
#include "RAJA/util/StaticLayout.hpp"
#include "RAJA/util/macros.hpp"
#include "RAJA/util/plugins.hpp"
Expand Down Expand Up @@ -176,21 +177,20 @@ struct LaunchParams
Threads apply(Threads const& a) { return (threads = a); }
};

class LaunchContext
class LaunchContextBase
{
public:
// Bump style allocator used to
// get memory from the pool
size_t shared_mem_offset;

void* shared_mem_ptr;

#if defined(RAJA_SYCL_ACTIVE)
// SGS ODR issue
mutable ::sycl::nd_item<3>* itm;
#endif

RAJA_HOST_DEVICE LaunchContext()
RAJA_HOST_DEVICE LaunchContextBase()
: shared_mem_offset(0),
shared_mem_ptr(nullptr)
{}
Expand All @@ -209,20 +209,6 @@ class LaunchContext
return static_cast<T*>(mem_ptr);
}

/*
//Odd dependecy with atomics is breaking CI builds
template<typename T, size_t DIM, typename IDX_T=RAJA::Index_type, ptrdiff_t
z_stride=DIM-1, typename arg, typename... args> RAJA_HOST_DEVICE auto
getSharedMemoryView(size_t bytes, arg idx, args... idxs)
{
T * mem_ptr = &((T*) shared_mem_ptr)[shared_mem_offset];

shared_mem_offset += bytes*sizeof(T);
return RAJA::View<T, RAJA::Layout<DIM, IDX_T, z_stride>>(mem_ptr, idx,
idxs...);
}
*/

RAJA_HOST_DEVICE void releaseSharedMemory()
{
// On the cpu/gpu we want to restart the count
Expand All @@ -243,6 +229,43 @@ class LaunchContext
}
};

// Launch context parameterized on a policy tag (declared in
// launch_context_policy.hpp). The policy selects what per-launch data the
// context carries beyond the shared-memory bookkeeping in LaunchContextBase.
template<typename LaunchContextPolicy>
class LaunchContextT;

// Default context: no data beyond LaunchContextBase. Queried at compile
// time via hasDim3 (see if constexpr dispatch in the backend kernels).
template<>
class LaunchContextT<LaunchContextDefaultPolicy> : public LaunchContextBase
{
public:
  // Compile-time flag: this context does not store dim3 members.
  static constexpr bool hasDim3 = false;

  using LaunchContextBase::LaunchContextBase;
};

// Preserve backwards compatibility
using LaunchContext = LaunchContextT<LaunchContextDefaultPolicy>;

#if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE)
// CUDA/HIP context that additionally stores dim3 copies of the launch
// dimensions, seeded by the backend at kernel entry (see the backend
// launch kernels, which construct it from threadIdx/blockDim).
template<>
class LaunchContextT<LaunchContextDim3Policy> : public LaunchContextBase
{
public:
  // Compile-time flag: this context stores dim3 members.
  static constexpr bool hasDim3 = true;

  // Copies of the device launch dimensions; loop implementations read
  // these instead of re-reading the device built-ins.
  dim3 thread_id;
  dim3 block_dim;

  RAJA_HOST_DEVICE
  LaunchContextT() : LaunchContextBase(), thread_id(), block_dim() {}

  RAJA_HOST_DEVICE
  LaunchContextT(dim3 thread, dim3 block)
      : LaunchContextBase(),
        thread_id(thread),
        block_dim(block)
  {}
};
#endif

template<typename LAUNCH_POLICY>
struct LaunchExecute;

Expand Down
93 changes: 80 additions & 13 deletions include/RAJA/policy/cuda/launch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,33 +28,42 @@
namespace RAJA
{

/*!
 * CUDA global function used by RAJA::launch with reduction parameters.
 *
 * \tparam BODY                launch body (device-callable)
 * \tparam LaunchContextPolicy selects the launch context specialization;
 *                             a dim3-carrying context is seeded with
 *                             threadIdx/blockDim at kernel entry
 * \tparam ReduceParams        reduction parameter pack
 */
template<typename BODY, typename LaunchContextPolicy, typename ReduceParams>
__global__ void launch_new_reduce_global_fcn(const RAJA_CUDA_GRID_CONSTANT BODY
                                                 body_in,
                                             ReduceParams reduce_params)
{
  using RAJA::internal::thread_privatize;
  auto privatizer = thread_privatize(body_in);
  auto& body      = privatizer.get_priv();

  // Set pointer to shared memory
  extern __shared__ char raja_shmem_ptr[];

  // Construct the context once; only the dim3-carrying context needs its
  // thread/block members seeded. This avoids duplicating the shared-memory
  // setup and body invocation across constexpr branches.
  LaunchContextT<LaunchContextPolicy> ctx;
  if constexpr (LaunchContextT<LaunchContextPolicy>::hasDim3)
  {
    ctx.thread_id = threadIdx;
    ctx.block_dim = blockDim;
  }
  ctx.shared_mem_ptr = raja_shmem_ptr;
  RAJA::expt::invoke_body(reduce_params, body, ctx);

  // Using a flatten global policy as we may use all dimensions
  RAJA::expt::ParamMultiplexer::parampack_combine(
      RAJA::cuda_flatten_global_xyz_direct {}, reduce_params);
}

template<bool async>
template<bool async, typename LaunchContextPolicy>
struct LaunchExecute<
RAJA::policy::cuda::cuda_launch_explicit_t<async,
named_usage::unspecified,
named_usage::unspecified>>
named_usage::unspecified,
LaunchContextPolicy>>
{

template<typename BODY_IN, typename ReduceParams>
Expand All @@ -72,7 +81,8 @@ struct LaunchExecute<
EXEC_POL pol {};

auto func = reinterpret_cast<const void*>(
&launch_new_reduce_global_fcn<BODY, camp::decay<ReduceParams>>);
&launch_new_reduce_global_fcn<BODY, LaunchContextPolicy,
camp::decay<ReduceParams>>);

resources::Cuda cuda_res = res.get<RAJA::resources::Cuda>();

Expand Down Expand Up @@ -137,32 +147,48 @@ struct LaunchExecute<
/*!
 * CUDA global function used by RAJA::launch with reduction parameters and
 * compile-time launch bounds (fixed thread count / blocks per SM).
 *
 * \tparam BODY                launch body (device-callable)
 * \tparam num_threads         max threads per block for __launch_bounds__
 * \tparam BLOCKS_PER_SM       min resident blocks per SM for __launch_bounds__
 * \tparam LaunchContextPolicy selects the launch context specialization;
 *                             a dim3-carrying context is seeded with
 *                             threadIdx/blockDim at kernel entry
 * \tparam ReduceParams        reduction parameter pack
 */
template<typename BODY,
         int num_threads,
         size_t BLOCKS_PER_SM,
         typename LaunchContextPolicy,
         typename ReduceParams>
__launch_bounds__(num_threads, BLOCKS_PER_SM) __global__
    void launch_new_reduce_global_fcn_fixed(const RAJA_CUDA_GRID_CONSTANT BODY
                                                body_in,
                                            ReduceParams reduce_params)
{
  using RAJA::internal::thread_privatize;
  auto privatizer = thread_privatize(body_in);
  auto& body      = privatizer.get_priv();

  // Set pointer to shared memory
  extern __shared__ char raja_shmem_ptr[];

  // Construct the context once; only the dim3-carrying context needs its
  // thread/block members seeded. This avoids duplicating the shared-memory
  // setup and body invocation across constexpr branches.
  LaunchContextT<LaunchContextPolicy> ctx;
  if constexpr (LaunchContextT<LaunchContextPolicy>::hasDim3)
  {
    ctx.thread_id = threadIdx;
    ctx.block_dim = blockDim;
  }
  ctx.shared_mem_ptr = raja_shmem_ptr;
  RAJA::expt::invoke_body(reduce_params, body, ctx);

  // Using a flatten global policy as we may use all dimensions
  RAJA::expt::ParamMultiplexer::parampack_combine(
      RAJA::cuda_flatten_global_xyz_direct {}, reduce_params);
}

template<bool async, int nthreads, size_t BLOCKS_PER_SM>
template<bool async,
int nthreads,
size_t BLOCKS_PER_SM,
typename LaunchContextPolicy>
struct LaunchExecute<
RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads, BLOCKS_PER_SM>>
RAJA::policy::cuda::cuda_launch_explicit_t<async,
nthreads,
BLOCKS_PER_SM,
LaunchContextPolicy>>
{

template<typename BODY_IN, typename ReduceParams>
Expand All @@ -183,6 +209,7 @@ struct LaunchExecute<

auto func = reinterpret_cast<const void*>(
&launch_new_reduce_global_fcn_fixed<BODY, nthreads, BLOCKS_PER_SM,
LaunchContextPolicy,
camp::decay<ReduceParams>>);

resources::Cuda cuda_res = res.get<RAJA::resources::Cuda>();
Expand Down Expand Up @@ -245,6 +272,46 @@ struct LaunchExecute<
}
};

/*
  Loop methods which rely on a copy of threadIdx/blockDim
  for performance. In collaboration with AMD we have found this
  approach to be more performant.
*/

namespace expt
{

// Execution policy tag: thread-stride loop over the named dimension that
// reads its stride from the dim3 values stored in the launch context
// (LaunchContextDim3Policy) rather than from the device built-ins.
template<named_dim DIM>
struct cuda_ctx_thread_loop;

// Per-dimension convenience aliases.
using cuda_ctx_thread_loop_x = cuda_ctx_thread_loop<named_dim::x>;
using cuda_ctx_thread_loop_y = cuda_ctx_thread_loop<named_dim::y>;
using cuda_ctx_thread_loop_z = cuda_ctx_thread_loop<named_dim::z>;

}  // namespace expt

/*!
 * Loop implementation for cuda_ctx_thread_loop: strides over the segment
 * by the block dimension, with the thread index and block dimension taken
 * from the dim3 copies held in the launch context instead of the device
 * built-ins.
 */
template<typename SEGMENT, named_dim DIM>
struct LoopExecute<expt::cuda_ctx_thread_loop<DIM>, SEGMENT>
{

  template<typename BODY>
  static RAJA_INLINE RAJA_DEVICE void exec(
      LaunchContextT<LaunchContextDim3Policy> const& ctx,
      SEGMENT const& segment,
      BODY const& body)
  {

    const int len = segment.end() - segment.begin();

    // Thread-stride loop: start at this thread's index in DIM, step by
    // the block extent in DIM. (Removed an unused local that cast DIM to
    // int; CudaDimHelper<DIM> handles the dimension selection.)
    for (int i = ::RAJA::internal::CudaDimHelper<DIM>::get(ctx.thread_id);
         i < len; i += ::RAJA::internal::CudaDimHelper<DIM>::get(ctx.block_dim))
    {
      body(*(segment.begin() + i));
    }
  }
};

/*
CUDA generic loop implementations
*/
Expand Down
15 changes: 10 additions & 5 deletions include/RAJA/policy/cuda/policy.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include <utility>

#include "RAJA/pattern/reduce.hpp"
#include "RAJA/pattern/launch/launch_context_policy.hpp"

#include "RAJA/policy/PolicyBase.hpp"
#include "RAJA/policy/sequential/policy.hpp"
Expand Down Expand Up @@ -361,8 +362,9 @@ struct cuda_exec_explicit : public RAJA::make_policy_pattern_launch_platform_t<
};

template<bool Async,
int num_threads = named_usage::unspecified,
size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM>
int num_threads = named_usage::unspecified,
size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM,
typename LaunchContextPolicy = LaunchContextDefaultPolicy>
struct cuda_launch_explicit_t
: public RAJA::make_policy_pattern_launch_platform_t<
RAJA::Policy::cuda,
Expand Down Expand Up @@ -1725,13 +1727,16 @@ using policy::cuda::cuda_synchronize;

// policies usable with launch
template<bool Async,
int num_threads = named_usage::unspecified,
size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM>
int num_threads = named_usage::unspecified,
size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM,
typename LaunchContextPolicy = LaunchContextDefaultPolicy>
using cuda_launch_explicit_t =
policy::cuda::cuda_launch_explicit_t<Async, num_threads, BLOCKS_PER_SM>;

// CUDA will emit warnings if we specify BLOCKS_PER_SM but not num of threads
template<bool Async, int num_threads = named_usage::unspecified>
template<bool Async,
int num_threads = named_usage::unspecified,
typename LaunchContextPolicy = LaunchContextDefaultPolicy>
using cuda_launch_t =
policy::cuda::cuda_launch_explicit_t<Async,
num_threads,
Expand Down
Loading
Loading