Thread loop optimizations RAJA launch #1949
base: develop
Changes from 8 commits
@@ -33,7 +33,7 @@ __global__ void launch_new_reduce_global_fcn(const RAJA_CUDA_GRID_CONSTANT BODY
     body_in,
     ReduceParams reduce_params)
 {
-  LaunchContext ctx;
+  LaunchContext ctx(threadIdx, blockDim);

   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
@@ -143,7 +143,7 @@ __launch_bounds__(num_threads, BLOCKS_PER_SM) __global__
     body_in,
     ReduceParams reduce_params)
 {
-  LaunchContext ctx;
+  LaunchContext ctx(threadIdx, blockDim);

   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
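In both reduction kernels, the context is no longer default-constructed; it now captures copies of the threadIdx and blockDim built-ins once at kernel entry. Below is a minimal sketch of the LaunchContext pieces this change appears to rely on. The member names thread_id and block_dim are taken from the loop hunk that follows; the constructor body and everything else about the real RAJA struct are assumptions, not part of this PR:

// Hypothetical sketch only; the actual RAJA LaunchContext may differ.
struct LaunchContext {
  dim3 thread_id;  // copy of threadIdx taken once at kernel entry
  dim3 block_dim;  // copy of blockDim taken once at kernel entry

  // Constructor matching the LaunchContext ctx(threadIdx, blockDim)
  // call sites in the hunks above.
  RAJA_DEVICE LaunchContext(dim3 tid, dim3 bdim)
      : thread_id(tid), block_dim(bdim) {}
};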
@@ -245,6 +245,45 @@ struct LaunchExecute<
   }
 };

+/*
+  Loop methods which rely on a copy of threadIdx/blockDim
+  for performance. In collaboration with AMD we have found this
+  to be more performant.
+*/
+
+namespace expt
+{
+
+template<named_dim DIM>
+struct cuda_ctx_thread_loop;
+
+using cuda_ctx_thread_loop_x = cuda_ctx_thread_loop<named_dim::x>;
+using cuda_ctx_thread_loop_y = cuda_ctx_thread_loop<named_dim::y>;
+using cuda_ctx_thread_loop_z = cuda_ctx_thread_loop<named_dim::z>;
+
+}  // namespace expt
+
+template<typename SEGMENT, named_dim DIM>
+struct LoopExecute<expt::cuda_ctx_thread_loop<DIM>, SEGMENT>
+{
+
+  template<typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment,
+                                           BODY const& body)
+  {
+
+    const int len = segment.end() - segment.begin();
+    constexpr int int_dim = static_cast<int>(DIM);
+
+    for (int i = ::RAJA::internal::CudaDimHelper<DIM>::get(ctx.thread_id);
+         i < len; i += ::RAJA::internal::CudaDimHelper<DIM>::get(ctx.block_dim))
+    {
+      body(*(segment.begin() + i));
+    }
+  }
+};
+
 /*
   CUDA generic loop implementations
 */
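To show how the new policy is meant to be driven, here is a hedged usage sketch. The alias cuda_ctx_thread_loop_x comes from the diff (assuming the expt namespace above nests under RAJA); the launch policy, the sizes N, n_blocks, and block_sz, and the arrays a, b, and out are illustrative assumptions, not part of this PR:

// Illustrative sketch only: drive the new experimental context-based
// thread loop from RAJA::launch. Assumes float* a, b, out and sizes
// N, n_blocks, block_sz are defined by the caller.
using thread_loop = RAJA::expt::cuda_ctx_thread_loop_x;

RAJA::launch<RAJA::cuda_launch_t<false>>(
    RAJA::LaunchParams(RAJA::Teams(n_blocks), RAJA::Threads(block_sz)),
    [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
      // The loop strides by the blockDim copy held in ctx rather than
      // re-reading the blockDim built-in on every iteration.
      RAJA::loop<thread_loop>(ctx, RAJA::RangeSegment(0, N), [&](int i) {
        out[i] = a[i] + b[i];
      });
    });

Per the comment in the diff, reusing the copies captured at kernel entry, instead of re-reading the built-ins inside the loop, was found in collaboration with AMD to be more performant.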