Changes from 8 commits (36 commits in the pull request)
cd5065e
initial commit for launch loop optimization
artv3 Nov 25, 2025
484ff1a
add structs to store gpu thread/block info in launch ctx
artv3 Nov 25, 2025
18f332b
add cuda variant and add build guards for cpu
artv3 Dec 2, 2025
21f6184
Merge branch 'develop' into artv3/launch-loop-opt
artv3 Dec 2, 2025
73f224a
rework to support dim3 copy in ctx
artv3 Dec 11, 2025
8a02fee
Merge branch 'artv3/launch-loop-opt' of https://github.com/LLNL/RAJA …
artv3 Dec 11, 2025
1fbe50b
minor clean up pass
artv3 Dec 11, 2025
672889e
make format
artv3 Dec 11, 2025
5908a20
Update include/RAJA/pattern/launch/launch_core.hpp
artv3 Dec 11, 2025
316e019
Merge branch 'develop' into artv3/launch-loop-opt
rhornung67 Dec 15, 2025
4d9f800
clean up pass
artv3 Dec 18, 2025
d9ce271
update with develop and fix merge conflicts
artv3 Dec 18, 2025
85aef5a
fix build error
artv3 Dec 18, 2025
0469302
take develop submodule
artv3 Dec 18, 2025
4a695f2
cuda backend
artv3 Dec 18, 2025
f91a498
make style
artv3 Dec 18, 2025
d21c41f
omp backend
artv3 Dec 18, 2025
40a5c1b
seq backend + make style
artv3 Dec 18, 2025
e0f4825
clean up pass
artv3 Dec 18, 2025
96e99d5
Update include/RAJA/pattern/launch/launch_context_policy.hpp
artv3 Dec 18, 2025
a9f0cca
minor clean up
artv3 Dec 18, 2025
7d4595b
minor clean up
artv3 Dec 18, 2025
c23f76f
Merge branch 'artv3/launch-loop-opt' of github.com:LLNL/RAJA into art…
artv3 Dec 18, 2025
c990a4f
revert changes to example
artv3 Dec 18, 2025
f7939fd
remove specialization from launch policy
artv3 Dec 18, 2025
c24331c
make work for function pointers
artv3 Dec 18, 2025
0518138
store dim3 based on launch context type - hip
artv3 Dec 19, 2025
d5da29a
rework omp backend
artv3 Dec 19, 2025
af88dbb
update sequential backend
artv3 Dec 19, 2025
21ad0a8
get things building for cuda -- need a good clean up pass
artv3 Dec 19, 2025
646a95b
cuda clean up pass
artv3 Dec 19, 2025
597641b
clean up ordering in hip launch
artv3 Dec 19, 2025
5403737
clean up ordering
artv3 Dec 19, 2025
e41e970
make style
artv3 Dec 19, 2025
7c95430
use constexpt for getting dim values
artv3 Dec 19, 2025
bfe72de
merge develop, fix conflict
artv3 Jan 19, 2026
65 changes: 62 additions & 3 deletions include/RAJA/pattern/launch/launch_core.hpp
@@ -176,25 +176,49 @@ struct LaunchParams
Threads apply(Threads const& a) { return (threads = a); }
};

class LaunchContext
template<bool StoreDim3 = false>
class LaunchContextT
{
public:
#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
// If StoreDim3 is true, store a dim3 copy by value; otherwise store an unused placeholder
typename std::conditional<StoreDim3, dim3, void*>::type thread_id;
typename std::conditional<StoreDim3, dim3, void*>::type block_dim;
#endif

// Bump style allocator used to
// get memory from the pool
size_t shared_mem_offset;

void* shared_mem_ptr;

#if defined(RAJA_SYCL_ACTIVE)
// SGS ODR issue
mutable ::sycl::nd_item<3>* itm;
#endif

RAJA_HOST_DEVICE LaunchContext()
RAJA_HOST_DEVICE LaunchContextT()
: shared_mem_offset(0),
shared_mem_ptr(nullptr)
{}

#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
// Only enable this constructor if StoreDim3 is true
template<bool S = StoreDim3, typename std::enable_if<S, int>::type = 0>
RAJA_HOST_DEVICE LaunchContextT(dim3 thread_id_, dim3 block_id_)
: shared_mem_offset(0),
shared_mem_ptr(nullptr),
thread_id(thread_id_),
block_dim(block_id_)
{}

// Only enable this constructor if StoreDim3 is false
template<bool S = StoreDim3, typename std::enable_if<!S, int>::type = 0>
RAJA_HOST_DEVICE LaunchContextT(dim3 thread_id_, dim3 block_id_)
: shared_mem_offset(0),
shared_mem_ptr(nullptr)
{}
#endif

// TODO handle alignment
template<typename T>
RAJA_HOST_DEVICE T* getSharedMemory(size_t bytes)
@@ -243,6 +267,9 @@ class LaunchContext
}
};

// Preserve backwards compatibility
using LaunchContext = LaunchContextT<false>;

template<typename LAUNCH_POLICY>
struct LaunchExecute;

@@ -479,6 +506,38 @@ RAJA_HOST_DEVICE RAJA_INLINE void loop(CONTEXT const& ctx,
LoopExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx, segment, body);
}

/*
template<typename POLICY_LIST, typename SEGMENT, typename BODY>
RAJA_HOST_DEVICE RAJA_INLINE void loop(LaunchContext const& ctx, SEGMENT const&
segment, BODY const& body)
{
LoopExecute<loop_policy<POLICY_LIST>, SEGMENT>::template exec<BODY>(ctx,
segment, body);
}
*/

/*
template<typename POLICY_LIST, typename SEGMENT, typename BODY>
RAJA_HOST_DEVICE RAJA_INLINE void loop(LaunchContextT<true> const& ctx, SEGMENT
const& segment, BODY const& body)
{
LoopExecute<loop_policy<POLICY_LIST>, SEGMENT>::template exec<BODY>(ctx,
segment, body);
}
*/


/*
// Overload for other contexts
template<typename POLICY_LIST, typename CONTEXT, typename SEGMENT, typename
BODY> std::enable_if_t<!is_launch_context<CONTEXT>::value> loop(CONTEXT const&
ctx, SEGMENT const& segment, BODY const& body)
{
LoopExecute<loop_policy<POLICY_LIST>, SEGMENT>::template exec<BODY>(ctx,
segment, body);
}
*/

template<typename POLICY_LIST,
typename CONTEXT,
typename SEGMENT,
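For readers less familiar with the pattern used in the LaunchContextT changes above, here is a minimal standalone sketch (not RAJA code) of the std::conditional member plus enable_if constructor technique. Dim3 is a hypothetical stand-in for CUDA's dim3 so the example compiles on the host; ContextT is illustrative only.

#include <cstdio>
#include <type_traits>

struct Dim3 { unsigned x, y, z; };

template<bool StoreDim3 = false>
struct ContextT
{
  // A real Dim3 member when StoreDim3 is true; otherwise an unused placeholder.
  typename std::conditional<StoreDim3, Dim3, void*>::type thread_id;

  ContextT() : thread_id() {}

  // This constructor participates in overload resolution only when StoreDim3 is true.
  template<bool S = StoreDim3, typename std::enable_if<S, int>::type = 0>
  ContextT(Dim3 tid) : thread_id(tid) {}
};

int main()
{
  ContextT<true> with_dims(Dim3{1, 2, 3});  // keeps the copy
  ContextT<> without_dims;                  // placeholder only
  std::printf("%u %u\n", with_dims.thread_id.x, with_dims.thread_id.z);
  (void)without_dims;
  return 0;
}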
43 changes: 41 additions & 2 deletions include/RAJA/policy/cuda/launch.hpp
@@ -33,7 +33,7 @@ __global__ void launch_new_reduce_global_fcn(const RAJA_CUDA_GRID_CONSTANT BODY
body_in,
ReduceParams reduce_params)
{
LaunchContext ctx;
LaunchContext ctx(threadIdx, blockDim);

using RAJA::internal::thread_privatize;
auto privatizer = thread_privatize(body_in);
@@ -143,7 +143,7 @@ __launch_bounds__(num_threads, BLOCKS_PER_SM) __global__
body_in,
ReduceParams reduce_params)
{
LaunchContext ctx;
LaunchContext ctx(threadIdx, blockDim);

using RAJA::internal::thread_privatize;
auto privatizer = thread_privatize(body_in);
@@ -245,6 +245,45 @@ struct LaunchExecute<
}
};

/*
  Loop methods which rely on a copy of threadIdx/blockDim
  for performance. In collaboration with AMD we found this
  to be more performant.
*/

namespace expt
{

template<named_dim DIM>
struct cuda_ctx_thread_loop;

using cuda_ctx_thread_loop_x = cuda_ctx_thread_loop<named_dim::x>;
using cuda_ctx_thread_loop_y = cuda_ctx_thread_loop<named_dim::y>;
using cuda_ctx_thread_loop_z = cuda_ctx_thread_loop<named_dim::z>;

} // namespace expt

template<typename SEGMENT, named_dim DIM>
struct LoopExecute<expt::cuda_ctx_thread_loop<DIM>, SEGMENT>
{

template<typename BODY>
static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
SEGMENT const& segment,
BODY const& body)
{

const int len = segment.end() - segment.begin();
constexpr int int_dim = static_cast<int>(DIM);

for (int i = ::RAJA::internal::CudaDimHelper<DIM>::get(ctx.thread_id);
i < len; i += ::RAJA::internal::CudaDimHelper<DIM>::get(ctx.block_dim))
{
body(*(segment.begin() + i));
}
}
};

Review thread on the loop above:

Member: If constexpr to get the values based on StoreDim3?

Member (Author): Can you share an example?

Member: See line 165 above: if constexpr (LaunchContextT<LaunchContextPolicy>::hasDim3)

Contributor: I wanted to echo that I think this is a good idea.

Contributor: Not sure if possible though, because ctx is a function parameter.

Member: We were able to get the argument type if there is one operator() that is not templated and has at least one argument. If that isn't true then it will use the default.
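Below is a hedged sketch of the "if constexpr" alternative raised in the thread above; it is not what the PR currently implements. It is written as a drop-in replacement for the exec() member above (SEGMENT and DIM come from the enclosing specialization), and it assumes LaunchContextT exposes a static constexpr flag (called hasDim3 here, following the review comment) and that CudaDimHelper::get also accepts the built-in threadIdx/blockDim variables.

template<typename CTX, typename BODY>
static RAJA_INLINE RAJA_DEVICE void exec(CTX const& ctx,
                                         SEGMENT const& segment,
                                         BODY const& body)
{
  const int len = segment.end() - segment.begin();

  int tid, bdim;
  if constexpr (CTX::hasDim3)
  {
    // Context carries dim3 copies: read the cached values.
    tid  = ::RAJA::internal::CudaDimHelper<DIM>::get(ctx.thread_id);
    bdim = ::RAJA::internal::CudaDimHelper<DIM>::get(ctx.block_dim);
  }
  else
  {
    // No cached copies: fall back to the CUDA built-ins.
    tid  = ::RAJA::internal::CudaDimHelper<DIM>::get(threadIdx);
    bdim = ::RAJA::internal::CudaDimHelper<DIM>::get(blockDim);
  }

  for (int i = tid; i < len; i += bdim)
  {
    body(*(segment.begin() + i));
  }
}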

/*
CUDA generic loop implementations
*/
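The last comment in the review thread describes deducing the lambda's context parameter type. A hedged, simplified sketch of one way that detection could work; the helper names first_arg and launch_context_for are illustrative and not part of this PR, and mutable lambdas or zero-argument operator() are not handled here.

#include <type_traits>

// Extract the first parameter type of a non-templated, non-overloaded operator().
template<typename T>
struct first_arg;

template<typename Ret, typename C, typename Arg0, typename... Rest>
struct first_arg<Ret (C::*)(Arg0, Rest...) const>
{
  using type = typename std::decay<Arg0>::type;
};

// Default: operator() is templated or overloaded, so fall back to the plain context.
template<typename BODY, typename = void>
struct launch_context_for
{
  using type = LaunchContextT<false>;
};

// If &BODY::operator() is well-formed, use whatever context type the lambda asks for.
template<typename BODY>
struct launch_context_for<BODY, std::void_t<decltype(&BODY::operator())>>
{
  using type = typename first_arg<decltype(&BODY::operator())>::type;
};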
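Finally, a hedged usage sketch of how user code might opt into the cached-dim3 path with the new loop policy. The launch API shape (RAJA::launch, LaunchParams, Teams/Threads, RAJA::loop, RangeSegment) follows existing RAJA launch usage; the exact namespace of cuda_ctx_thread_loop_x and the rule that naming LaunchContextT<true> as the lambda parameter selects the dim3-storing context are assumptions based on this PR and the review discussion. Variables a, x, y, N, n_blocks, and block_sz are placeholders.

// DAXPY-style kernel using the context-cached thread loop (hypothetical usage).
RAJA::launch<RAJA::cuda_launch_t<false>>(
    RAJA::LaunchParams(RAJA::Teams(n_blocks), RAJA::Threads(block_sz)),
    [=] RAJA_HOST_DEVICE(RAJA::LaunchContextT<true> ctx) {
      // Asking for LaunchContextT<true> requests the threadIdx/blockDim copies.
      RAJA::loop<RAJA::expt::cuda_ctx_thread_loop_x>(
          ctx, RAJA::RangeSegment(0, N), [&](int i) { y[i] += a * x[i]; });
    });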