diff --git a/examples/4D_tensor_contraction.cpp b/examples/4D_tensor_contraction.cpp new file mode 100644 index 0000000000..51ffea53a6 --- /dev/null +++ b/examples/4D_tensor_contraction.cpp @@ -0,0 +1,191 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-25, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" +#include "camp/resource.hpp" +#include "memoryManager.hpp" + + +/* + * RAJA Launch Example: Upper Triangular Pattern + Shared Memory + * + * Launch introduces hierarchical parallelism through the concept of + * teams and threads. Computation is executed in a pre-defined grid + * composed of threads and grouped into teams. The teams model enables + * developers to express parallelism through loops over teams, and inner loops + * over threads. Team loops are executed in parallel and + * threads within a team should be treated as sub-parallel regions. + * + * Team shared memory is allocated between team and thread loops. + * Memory allocated within thread loops are thread private. + * The example below demonstrates composing an upper triangular + * loop pattern, and using shared memory. + * + */ + +// Define problem setup +constexpr int TotalMats = 100; + +constexpr int I = 2; +constexpr int J = 2; +constexpr int L = 2; +constexpr int K = 2; +constexpr int M = 2; +constexpr int N = 2; +constexpr int O = 2; + +/* + * Define host/device launch policies + */ +const bool async = false; +using launch_policy = RAJA::LaunchPolicy +#endif + >; + +using teams = RAJA::LoopPolicy; + +using loop_0 = RAJA::LoopPolicy +#endif + >; +using loop_1 = RAJA::LoopPolicy +#endif + >; +using loop_2 = RAJA::LoopPolicy +#endif + >; +using loop_3 = RAJA::LoopPolicy +#endif + >; + +using loop_4 = RAJA::LoopPolicy +#endif + >; +using loop_5 = RAJA::LoopPolicy +#endif + >; + + +template +void tensor_contraction(AVIEW A, BVIEW B, CVIEW C, RAJA::ExecPlace platform) +{ + + RAJA::launch + (platform, RAJA::LaunchParams(RAJA::Teams(TotalMats), RAJA::Threads<6>(I, J, K, M, N, O)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, TotalMats), [&](int r) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, I), [&](int i) { + RAJA::loop(ctx, RAJA::RangeSegment(0, J), [&](int j) { + RAJA::loop(ctx, RAJA::RangeSegment(0, K), [&](int k) { + RAJA::loop(ctx, RAJA::RangeSegment(0, M), [&](int m) { + RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&](int n) { + RAJA::loop(ctx, RAJA::RangeSegment(0, O), [&](int o) { + + double dot = 0.0; + for(int l = 0; l < L; ++l) { + dot += A(r, i,j,k,l) * B(r, l,m,n,o); + } + C(r, i,j,k,m,n,o) = dot; + + }); + }); + }); + }); + }); + }); + }); + + }); // outer lambda + + +} + + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + + double *Aptr = memoryManager::allocate(TotalMats * I * J * K * L); + double *Bptr = memoryManager::allocate(TotalMats * L * M * N * O); + double *Cptr = memoryManager::allocate(TotalMats * I * J * K * M * N * O); + + double *test_Cptr = memoryManager::allocate(TotalMats * I * J * K * M * N * O); + + auto A = RAJA::make_permuted_view(Aptr, TotalMats, I, J, K, L); + auto B = RAJA::make_permuted_view(Bptr, TotalMats, L, M, N, O); + auto C = RAJA::make_permuted_view(Cptr, TotalMats, I, J, K, M, N, O); + auto test_C = RAJA::make_permuted_view(test_Cptr, TotalMats, I, J, K, M, N, O); + + // Initialize A and B with some values + for(int mat = 0; mat < TotalMats; ++mat) { + + for (int i = 0; i < I; i++) { + for (int j = 0; j < J; j++) { + for (int k = 0; k < K; k++) { + for (int l = 0; l < L; l++) { + A(mat, i, j, k, l) = 1.0; + } + } + } + } + + for (int l = 0; l < L; l++) { + for (int m = 0; m < M; m++) { + for (int n = 0; n < N; n++) { + for (int o = 0; o < O; o++) { + B(mat, l, m, n, o) = 1.0; + } + } + } + } + + } + + + + tensor_contraction(A, B, C, RAJA::ExecPlace::HOST); + + tensor_contraction(A, B, test_C, RAJA::ExecPlace::DEVICE); + + + //test correctness + double diff = 0.0; + for(int i = 0; i < I * J * K * M * N * O; ++i) { + diff += fabs(Cptr[i] - test_Cptr[i]); + } + + std::cout<<"diff = "< +#include +#include +#include + +#include "RAJA/RAJA.hpp" +#include "camp/resource.hpp" + + +/* + * RAJA Launch Example: Upper Triangular Pattern + Shared Memory + * + * Launch introduces hierarchical parallelism through the concept of + * teams and threads. Computation is executed in a pre-defined grid + * composed of threads and grouped into teams. The teams model enables + * developers to express parallelism through loops over teams, and inner loops + * over threads. Team loops are executed in parallel and + * threads within a team should be treated as sub-parallel regions. + * + * Team shared memory is allocated between team and thread loops. + * Memory allocated within thread loops are thread private. + * The example below demonstrates composing an upper triangular + * loop pattern, and using shared memory. + * + */ + +/* + * Define host/device launch policies + */ +using launch_policy = RAJA::LaunchPolicy< +#if defined(RAJA_ENABLE_OPENMP) + RAJA::omp_launch_t +#else + RAJA::seq_launch_t +#endif +#if defined(RAJA_ENABLE_CUDA) + , + RAJA::cuda_launch_t +#endif +#if defined(RAJA_ENABLE_HIP) + , + RAJA::hip_launch_t +#endif + >; + +/* + * Define team policies. + * Up to 3 dimension are supported: x,y,z + */ +using teams_x = RAJA::LoopPolicy< +#if defined(RAJA_ENABLE_OPENMP) + RAJA::omp_parallel_for_exec +#else + RAJA::seq_exec +#endif +#if defined(RAJA_ENABLE_CUDA) + , + RAJA::cuda_block_x_direct +#endif +#if defined(RAJA_ENABLE_HIP) + , + RAJA::hip_block_x_direct +#endif + >; +/* + * Define thread policies. + * Up to 3 dimension are supported: x,y,z + */ +using threads_x = RAJA::LoopPolicy; + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + // Resource object for host + camp::resources::Host host_res; + + // Resource objects for CUDA or HIP +#if defined(RAJA_ENABLE_CUDA) + camp::resources::Cuda device_res; +#endif + +#if defined(RAJA_ENABLE_HIP) + camp::resources::Hip device_res; +#endif + + std::cout << "\n Running RAJA-Launch examples...\n"; + int num_of_backends = 1; +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + num_of_backends++; +#endif + + // RAJA teams may switch between host and device policies at run time. + // The loop below will execute through the available backends. + + for (int exec_place = 0; exec_place < num_of_backends; ++exec_place) { + + auto select_cpu_or_gpu = (RAJA::ExecPlace)exec_place; + + // Allocate memory for either host or device + int N_tri = 5; + + int* Ddat = nullptr; + if (select_cpu_or_gpu == RAJA::ExecPlace::HOST) { + Ddat = host_res.allocate(N_tri * N_tri); + } + +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { + Ddat = device_res.allocate(N_tri * N_tri); + } +#endif + + /* + * RAJA::launch just starts a "kernel" and doesn't provide any looping. + * + * The first argument determines which policy should be executed, + * + * The second argument is the number of teams+threads needed for each of the + * policies. + * + * Third argument is the lambda. + * + * The lambda takes a "resource" object, which has the teams+threads + * and is used to perform thread synchronizations within a team. + */ + + if (select_cpu_or_gpu == RAJA::ExecPlace::HOST){ + std::cout << "\n Running upper triangular pattern example on the host...\n"; + } else { + std::cout << "\n Running upper triangular pattern example on the device...\n"; + } + + RAJA::LaunchParams{}; + + //Example of high-dimensional thread-team + RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads<4>(1,2,3,4)); + + RAJA::View> D(Ddat, N_tri, N_tri); + + RAJA::launch + (select_cpu_or_gpu, + RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads<4>(1,2,3,4)), + //RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads(N_tri)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + printf("in kernel \n"); + RAJA::loop(ctx, RAJA::RangeSegment(0, N_tri), [&](int r) { + + // Array shared within threads of the same team + RAJA_TEAM_SHARED int s_A[1]; + + RAJA::loop(ctx, RAJA::RangeSegment(0, 1), [&](int c) { + s_A[c] = r; + }); // loop c + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(r, N_tri), [&](int c) { + D(r, c) = r * N_tri + c; + printf("r=%d, c=%d : D=%d : s_A = %d \n", r, c, D(r, c), s_A[0]); + }); // loop c + + }); // loop r + + }); // outer lambda + + if (select_cpu_or_gpu == RAJA::ExecPlace::HOST) { + host_res.deallocate(Ddat); + } + +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { + device_res.deallocate(Ddat); + } +#endif + + } // Execution places loop + + +} // Main diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index bfce94057c..8a580fb801 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -107,10 +107,13 @@ struct Teams constexpr Teams(int i, int j, int k) : value {i, j, k} {} }; +template struct Threads { - int value[3]; + std::array value; + // int value[DIM]; +#if 1 RAJA_INLINE RAJA_HOST_DEVICE @@ -130,35 +133,26 @@ struct Threads RAJA_HOST_DEVICE constexpr Threads(int i, int j, int k) : value {i, j, k} {} -}; - -struct Lanes -{ - int value; - - RAJA_INLINE - - RAJA_HOST_DEVICE - constexpr Lanes() : value(0) {} - RAJA_INLINE - - RAJA_HOST_DEVICE - constexpr Lanes(int i) : value(i) {} + //#else + template + constexpr Threads(Args... args) : value {static_cast(args)...} {}; +#endif }; +template struct LaunchParams { public: Teams teams; - Threads threads; + Threads threads; size_t shared_mem_size; RAJA_INLINE LaunchParams() = default; LaunchParams(Teams in_teams, - Threads in_threads, + Threads in_threads, size_t in_shared_mem_size = 0) : teams(in_teams), threads(in_threads), @@ -170,10 +164,12 @@ struct LaunchParams RAJA_INLINE Teams apply(Teams const& a) { return (teams = a); } + /* RAJA_HOST_DEVICE RAJA_INLINE Threads apply(Threads const& a) { return (threads = a); } + */ }; class LaunchContext @@ -185,6 +181,10 @@ class LaunchContext void* shared_mem_ptr; + //hardcoded for now... + std::array thread_dim; + std::array thread_id; + #if defined(RAJA_ENABLE_SYCL) mutable ::sycl::nd_item<3>* itm; #endif @@ -246,8 +246,8 @@ struct LaunchExecute; // Duplicate of code above on account that we need to support the case in which // a kernel_name is not given -template -void launch(LaunchParams const& launch_params, +template +void launch(LaunchParams const& launch_params, ReduceParams&&... rest_of_launch_args) { // Get reducers @@ -288,17 +288,19 @@ void launch(LaunchParams const& launch_params, //================================================= // Run time based policy launch //================================================= -template -void launch(ExecPlace place, LaunchParams const& params, BODY const& body) +template +void launch(ExecPlace place, + LaunchParams const& params, + BODY const& body) { launch(place, params, body); } // Run-time API for new reducer interface with support of the case without a new // kernel name -template +template void launch(ExecPlace place, - const LaunchParams& launch_params, + LaunchParams const& launch_params, ReduceParams&&... rest_of_launch_args) // BODY const &body) { @@ -367,10 +369,10 @@ RAJA::resources::Resource Get_Host_Resource(T host_res, RAJA::ExecPlace device) // Duplicate of API above on account that we need to handle the case that a // kernel name is not provided -template +template resources::EventProxy launch( RAJA::resources::Resource res, - LaunchParams const& launch_params, + LaunchParams const& launch_params, ReduceParams&&... rest_of_launch_args) { diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp index f0b7d0be98..6a54205ded 100644 --- a/include/RAJA/policy/cuda/launch.hpp +++ b/include/RAJA/policy/cuda/launch.hpp @@ -72,13 +72,13 @@ struct LaunchExecute< named_usage::unspecified>> { - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, RAJA::expt::type_traits::is_ForallParamPack_empty> exec(RAJA::resources::Resource res, - const LaunchParams& params, + const LaunchParams& params, BODY_IN&& body_in, ReduceParams& RAJA_UNUSED_ARG(launch_reducers)) { @@ -88,6 +88,14 @@ struct LaunchExecute< resources::Cuda cuda_res = res.get(); + + if (params.threads.value.size() > 3) + { + std::cout << "threads container is larger than 3 : " + << params.threads.value.size() << std::endl; + } + + // // Compute the number of blocks and threads // @@ -96,10 +104,32 @@ struct LaunchExecute< static_cast(params.teams.value[1]), static_cast(params.teams.value[2])}; + cuda_dim_t blockSize; + + if (params.threads.value.size() < 4) + { + blockSize = + cuda_dim_t {static_cast(params.threads.value[0]), + static_cast(params.threads.value[1]), + static_cast(params.threads.value[2])}; + } + else + { + + int total_threads = detail::multiplyArray(params.threads.value); + std::cout << "Total threads" << std::endl; + blockSize = cuda_dim_t {static_cast( + detail::multiplyArray(params.threads.value)), + static_cast(1), + static_cast(1)}; + } + + /* cuda_dim_t blockSize { static_cast(params.threads.value[0]), static_cast(params.threads.value[1]), static_cast(params.threads.value[2])}; + */ // Only launch kernel if we have something to iterate over constexpr cuda_dim_member_t zero = 0; @@ -129,19 +159,23 @@ struct LaunchExecute< RAJA_FT_END; } + else + { + std::cout << "did not launch kernel " << std::endl; + } return resources::EventProxy(res); } // Version with explicit reduction parameters.. - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty>> exec(RAJA::resources::Resource res, - const LaunchParams& launch_params, + const LaunchParams& launch_params, BODY_IN&& body_in, ReduceParams& launch_reducers) { @@ -260,13 +294,13 @@ struct LaunchExecute< RAJA::policy::cuda::cuda_launch_explicit_t> { - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, RAJA::expt::type_traits::is_ForallParamPack_empty> exec(RAJA::resources::Resource res, - const LaunchParams& params, + const LaunchParams& params, BODY_IN&& body_in, ReduceParams& RAJA_UNUSED_ARG(launch_reducers)) { @@ -323,14 +357,14 @@ struct LaunchExecute< } // Version with explicit reduction parameters.. - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty>> exec(RAJA::resources::Resource res, - const LaunchParams& launch_params, + const LaunchParams& launch_params, BODY_IN&& body_in, ReduceParams& launch_reducers) { diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp index ab27e9b456..32e5a6595b 100644 --- a/include/RAJA/policy/hip/launch.hpp +++ b/include/RAJA/policy/hip/launch.hpp @@ -28,6 +28,25 @@ namespace RAJA { +// internal helper function +namespace detail +{ + +template +constexpr T multiply_impl(const std::array& arr, + std::index_sequence) +{ + return (arr[I] * ...); +} + +template +constexpr T multiplyArray(const std::array& arr) +{ + return multiply_impl(arr, std::make_index_sequence {}); +} + +} // namespace detail + template __global__ void launch_global_fcn(BODY body_in) { @@ -44,6 +63,37 @@ __global__ void launch_global_fcn(BODY body_in) body(ctx); } +template +__global__ void launch_global_fcn_ctx(BODY body_in, LaunchContext ctx) +{ + //LaunchContext ctx; + + //unravel index + int tid = threadIdx.x; + for (int d = ctx.thread_dim.size()-1; d >= 0; --d) { + ctx.thread_id[d] = tid % ctx.thread_dim[d]; + tid /= ctx.thread_dim[d]; + } + + /* + if(threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0){ + for(int i=0; i __global__ void launch_new_reduce_global_fcn(BODY body_in, ReduceParams reduce_params) @@ -70,22 +120,29 @@ struct LaunchExecute< RAJA::policy::hip::hip_launch_t> { - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, RAJA::expt::type_traits::is_ForallParamPack_empty> exec(RAJA::resources::Resource res, - const LaunchParams& params, + const LaunchParams& params, BODY_IN&& body_in, ReduceParams& RAJA_UNUSED_ARG(launch_reducers)) { using BODY = camp::decay; - auto func = reinterpret_cast(&launch_global_fcn); + auto func = reinterpret_cast(&launch_global_fcn_ctx); resources::Hip hip_res = res.get(); + if (params.threads.value.size() > 3) + { + std::cout << "threads container is larger than 3 : " + << params.threads.value.size() << std::endl; + } + + // // Compute the number of blocks and threads // @@ -94,10 +151,33 @@ struct LaunchExecute< static_cast(params.teams.value[1]), static_cast(params.teams.value[2])}; + /* hip_dim_t blockSize { static_cast(params.threads.value[0]), static_cast(params.threads.value[1]), static_cast(params.threads.value[2])}; + */ + + hip_dim_t blockSize; + + if (params.threads.value.size() < 4) + { + blockSize = + hip_dim_t {static_cast(params.threads.value[0]), + static_cast(params.threads.value[1]), + static_cast(params.threads.value[2])}; + } + else + { + + int total_threads = detail::multiplyArray(params.threads.value); + std::cout << "Total threads" << std::endl; + blockSize = hip_dim_t {static_cast( + detail::multiplyArray(params.threads.value)), + static_cast(1), + static_cast(1)}; + } + // Only launch kernel if we have something to iterate over constexpr hip_dim_member_t zero = 0; @@ -109,7 +189,7 @@ struct LaunchExecute< { size_t shared_mem_size = params.shared_mem_size; - + std::cout << "launching kernel " << std::endl; // // Privatize the loop_body, using make_launch_body to setup reductions // @@ -117,29 +197,38 @@ struct LaunchExecute< shared_mem_size, hip_res, std::forward(body_in)); + //Copy threads over + LaunchContext ctx; + ctx.thread_dim = params.threads.value; + // // Launch the kernel // - void* args[] = {(void*)&body}; + void* args[] = {(void*)&body, (void*)&ctx}; RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size, hip_res, async); } RAJA_FT_END; } + else + { + + std::cout << "did not launch kernel " << std::endl; + } return resources::EventProxy(res); } // Version with explicit reduction parameters.. - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty>> exec(RAJA::resources::Resource res, - const LaunchParams& launch_params, + const LaunchParams& launch_params, BODY_IN&& body_in, ReduceParams& launch_reducers) { @@ -255,13 +344,13 @@ template struct LaunchExecute> { - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, RAJA::expt::type_traits::is_ForallParamPack_empty> exec(RAJA::resources::Resource res, - const LaunchParams& params, + const LaunchParams& params, BODY_IN&& body_in, ReduceParams& RAJA_UNUSED_ARG(launch_reducers)) { @@ -318,14 +407,14 @@ struct LaunchExecute> } // Version with explicit reduction parameters.. - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty>> exec(RAJA::resources::Resource res, - const LaunchParams& launch_params, + const LaunchParams& launch_params, BODY_IN&& body_in, ReduceParams& launch_reducers) { @@ -401,6 +490,30 @@ struct LaunchExecute> } }; + +/* + Arbitrary dimension thread indexing +*/ +template +struct hip_loop_dim_exec; + +template +struct LoopExecute, SEGMENT> +{ + + template + static RAJA_INLINE RAJA_DEVICE void exec( + LaunchContext const &ctx, + SEGMENT const& segment, + BODY const& body) + { + const int i = ctx.thread_id[DIM]; + + body(*(segment.begin() + i)); + } +}; + + /* HIP generic loop implementations */ diff --git a/include/RAJA/policy/openmp/launch.hpp b/include/RAJA/policy/openmp/launch.hpp index 2092c87bb3..a7cebc31c9 100644 --- a/include/RAJA/policy/openmp/launch.hpp +++ b/include/RAJA/policy/openmp/launch.hpp @@ -28,13 +28,13 @@ template<> struct LaunchExecute { - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, RAJA::expt::type_traits::is_ForallParamPack_empty> exec(RAJA::resources::Resource res, - LaunchParams const& params, + LaunchParams const& params, BODY const& body, ReduceParams& RAJA_UNUSED_ARG(launch_reducers)) { @@ -55,14 +55,14 @@ struct LaunchExecute return resources::EventProxy(res); } - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty>> exec(RAJA::resources::Resource res, - LaunchParams const& launch_params, + LaunchParams const& launch_params, BODY const& body, ReduceParams& f_params) { diff --git a/include/RAJA/policy/sequential/launch.hpp b/include/RAJA/policy/sequential/launch.hpp index ee98804ecf..1026e78af2 100644 --- a/include/RAJA/policy/sequential/launch.hpp +++ b/include/RAJA/policy/sequential/launch.hpp @@ -40,13 +40,13 @@ template<> struct LaunchExecute { - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, RAJA::expt::type_traits::is_ForallParamPack_empty> exec(RAJA::resources::Resource res, - LaunchParams const& params, + LaunchParams const& params, BODY const& body, ReduceParams& RAJA_UNUSED_ARG(ReduceParams)) { @@ -64,14 +64,14 @@ struct LaunchExecute return resources::EventProxy(res); } - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty>> exec(RAJA::resources::Resource res, - LaunchParams const& launch_params, + LaunchParams const& launch_params, BODY const& body, ReduceParams& launch_reducers) { diff --git a/include/RAJA/policy/sycl/launch.hpp b/include/RAJA/policy/sycl/launch.hpp index f69e2c4424..7dacc5105d 100644 --- a/include/RAJA/policy/sycl/launch.hpp +++ b/include/RAJA/policy/sycl/launch.hpp @@ -33,7 +33,8 @@ struct LaunchExecute> { // If the launch lambda is trivially copyable - template {}, bool>::type = true> @@ -42,7 +43,7 @@ struct LaunchExecute> RAJA::expt::type_traits::is_ForallParamPack, RAJA::expt::type_traits::is_ForallParamPack_empty> exec(RAJA::resources::Resource res, - const LaunchParams& params, + const LaunchParams& params, BODY_IN&& body_in, ReduceParams& RAJA_UNUSED_ARG(launch_reducers)) { @@ -102,7 +103,8 @@ struct LaunchExecute> // If the launch lambda is trivially copyable and we have explcit reduction // parameters - template {}, bool>::type = true> @@ -112,7 +114,7 @@ struct LaunchExecute> concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty>> exec(RAJA::resources::Resource res, - const LaunchParams& launch_params, + const LaunchParams& launch_params, BODY_IN&& body_in, ReduceParams launch_reducers) { @@ -194,7 +196,8 @@ struct LaunchExecute> } // If the launch lambda is not trivially copyable - template {}, bool>::type = true> @@ -203,7 +206,7 @@ struct LaunchExecute> RAJA::expt::type_traits::is_ForallParamPack, RAJA::expt::type_traits::is_ForallParamPack_empty> exec(RAJA::resources::Resource res, - const LaunchParams& params, + const LaunchParams& params, BODY_IN&& body_in, ReduceParams& RAJA_UNUSED_ARG(launch_reducers)) { @@ -269,7 +272,8 @@ struct LaunchExecute> } // If the launch lambda is not trivially copyable - template {}, bool>::type = true> @@ -279,7 +283,7 @@ struct LaunchExecute> concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty>> exec(RAJA::resources::Resource res, - const LaunchParams& launch_params, + const LaunchParams& launch_params, BODY_IN&& body_in, ReduceParams launch_reducers) {