diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 39b1a5d7f..7f9ff68d5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -76,6 +76,8 @@ blt_add_executable( apps/MASS3DEA-Seq.cpp apps/MASS3DPA.cpp apps/MASS3DPA-Seq.cpp + apps/MASS3DPA_ATOMIC.cpp + apps/MASS3DPA_ATOMIC-Seq.cpp apps/MASSVEC3DPA.cpp apps/MASSVEC3DPA-Seq.cpp apps/MATVEC_3D_STENCIL.cpp diff --git a/src/apps/CMakeLists.txt b/src/apps/CMakeLists.txt index d5e3814cd..29e54a32c 100644 --- a/src/apps/CMakeLists.txt +++ b/src/apps/CMakeLists.txt @@ -90,6 +90,12 @@ blt_add_library( MASS3DPA-Seq.cpp MASS3DPA-OMP.cpp MASS3DPA-Sycl.cpp + MASS3DPA_ATOMIC.cpp + MASS3DPA_ATOMIC-Cuda.cpp + MASS3DPA_ATOMIC-Hip.cpp + MASS3DPA_ATOMIC-Seq.cpp + MASS3DPA_ATOMIC-OMP.cpp + MASS3DPA_ATOMIC-Sycl.cpp MASSVEC3DPA.cpp MASSVEC3DPA-Cuda.cpp MASSVEC3DPA-Hip.cpp diff --git a/src/apps/CONVECTION3DPA-Cuda.cpp b/src/apps/CONVECTION3DPA-Cuda.cpp index 3fe85c7c3..c3c55b744 100644 --- a/src/apps/CONVECTION3DPA-Cuda.cpp +++ b/src/apps/CONVECTION3DPA-Cuda.cpp @@ -20,7 +20,7 @@ namespace rajaperf { namespace apps { template < size_t block_size > - __launch_bounds__(block_size) +__launch_bounds__(block_size) __global__ void Convection3DPA(const Real_ptr Basis, const Real_ptr tBasis, const Real_ptr dBasis, const Real_ptr D, const Real_ptr X, Real_ptr Y) { @@ -29,11 +29,11 @@ __global__ void Convection3DPA(const Real_ptr Basis, const Real_ptr tBasis, CONVECTION3DPA_0_GPU; - GPU_FOREACH_THREAD(dz,z,CPA_D1D) + GPU_FOREACH_THREAD(dz,z,conv::D1D) { - GPU_FOREACH_THREAD(dy,y,CPA_D1D) + GPU_FOREACH_THREAD(dy,y,conv::D1D) { - GPU_FOREACH_THREAD(dx,x,CPA_D1D) + GPU_FOREACH_THREAD(dx,x,conv::D1D) { CONVECTION3DPA_1; } @@ -41,11 +41,11 @@ __global__ void Convection3DPA(const Real_ptr Basis, const Real_ptr tBasis, } __syncthreads(); - GPU_FOREACH_THREAD(dz,z,CPA_D1D) + GPU_FOREACH_THREAD(dz,z,conv::D1D) { - GPU_FOREACH_THREAD(dy,y,CPA_D1D) + GPU_FOREACH_THREAD(dy,y,conv::D1D) { - GPU_FOREACH_THREAD(qx,x,CPA_Q1D) + 
GPU_FOREACH_THREAD(qx,x,conv::Q1D) { CONVECTION3DPA_2; } @@ -53,11 +53,11 @@ __global__ void Convection3DPA(const Real_ptr Basis, const Real_ptr tBasis, } __syncthreads(); - GPU_FOREACH_THREAD(dz,z,CPA_D1D) + GPU_FOREACH_THREAD(dz,z,conv::D1D) { - GPU_FOREACH_THREAD(qx,x,CPA_Q1D) + GPU_FOREACH_THREAD(qx,x,conv::Q1D) { - GPU_FOREACH_THREAD(qy,y,CPA_Q1D) + GPU_FOREACH_THREAD(qy,y,conv::Q1D) { CONVECTION3DPA_3; } @@ -65,11 +65,11 @@ __global__ void Convection3DPA(const Real_ptr Basis, const Real_ptr tBasis, } __syncthreads(); - GPU_FOREACH_THREAD(qx,x,CPA_Q1D) + GPU_FOREACH_THREAD(qx,x,conv::Q1D) { - GPU_FOREACH_THREAD(qy,y,CPA_Q1D) + GPU_FOREACH_THREAD(qy,y,conv::Q1D) { - GPU_FOREACH_THREAD(qz,z,CPA_Q1D) + GPU_FOREACH_THREAD(qz,z,conv::Q1D) { CONVECTION3DPA_4; } @@ -77,11 +77,11 @@ __global__ void Convection3DPA(const Real_ptr Basis, const Real_ptr tBasis, } __syncthreads(); - GPU_FOREACH_THREAD(qz,z,CPA_Q1D) + GPU_FOREACH_THREAD(qz,z,conv::Q1D) { - GPU_FOREACH_THREAD(qy,y,CPA_Q1D) + GPU_FOREACH_THREAD(qy,y,conv::Q1D) { - GPU_FOREACH_THREAD(qx,x,CPA_Q1D) + GPU_FOREACH_THREAD(qx,x,conv::Q1D) { CONVECTION3DPA_5; } @@ -89,11 +89,11 @@ __global__ void Convection3DPA(const Real_ptr Basis, const Real_ptr tBasis, } __syncthreads(); - GPU_FOREACH_THREAD(qx,x,CPA_Q1D) + GPU_FOREACH_THREAD(qx,x,conv::Q1D) { - GPU_FOREACH_THREAD(qy,y,CPA_Q1D) + GPU_FOREACH_THREAD(qy,y,conv::Q1D) { - GPU_FOREACH_THREAD(dz,z,CPA_D1D) + GPU_FOREACH_THREAD(dz,z,conv::D1D) { CONVECTION3DPA_6; } @@ -101,11 +101,11 @@ __global__ void Convection3DPA(const Real_ptr Basis, const Real_ptr tBasis, } __syncthreads(); - GPU_FOREACH_THREAD(dz,z,CPA_D1D) + GPU_FOREACH_THREAD(dz,z,conv::D1D) { - GPU_FOREACH_THREAD(qx,x,CPA_Q1D) + GPU_FOREACH_THREAD(qx,x,conv::Q1D) { - GPU_FOREACH_THREAD(dy,y,CPA_D1D) + GPU_FOREACH_THREAD(dy,y,conv::D1D) { CONVECTION3DPA_7; } @@ -113,11 +113,11 @@ __global__ void Convection3DPA(const Real_ptr Basis, const Real_ptr tBasis, } __syncthreads(); - GPU_FOREACH_THREAD(dz,z,CPA_D1D) + 
GPU_FOREACH_THREAD(dz,z,conv::D1D) { - GPU_FOREACH_THREAD(dy,y,CPA_D1D) + GPU_FOREACH_THREAD(dy,y,conv::D1D) { - GPU_FOREACH_THREAD(dx,x,CPA_D1D) + GPU_FOREACH_THREAD(dx,x,conv::D1D) { CONVECTION3DPA_8; } @@ -144,7 +144,7 @@ void CONVECTION3DPA::runCudaVariantImpl(VariantID vid) { // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { - dim3 nthreads_per_block(CPA_Q1D, CPA_Q1D, CPA_Q1D); + dim3 nthreads_per_block(conv::Q1D, conv::Q1D, conv::Q1D); constexpr size_t shmem = 0; RPlaunchCudaKernel( (Convection3DPA), @@ -162,27 +162,28 @@ void CONVECTION3DPA::runCudaVariantImpl(VariantID vid) { constexpr bool async = true; using launch_policy = - RAJA::LaunchPolicy>; + RAJA::LaunchPolicy>; using outer_x = RAJA::LoopPolicy; using inner_x = - RAJA::LoopPolicy>; + RAJA::LoopPolicy>; using inner_y = - RAJA::LoopPolicy>; + RAJA::LoopPolicy>; using inner_z = - RAJA::LoopPolicy>; + RAJA::LoopPolicy>; startTimer(); // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { + //clang-format off RAJA::launch( res, RAJA::LaunchParams(RAJA::Teams(NE), - RAJA::Threads(CPA_Q1D, CPA_Q1D, CPA_Q1D)), + RAJA::Threads(conv::Q1D, conv::Q1D, conv::Q1D)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { RAJA::loop(ctx, RAJA::RangeSegment(0, NE), @@ -190,11 +191,11 @@ void CONVECTION3DPA::runCudaVariantImpl(VariantID vid) { CONVECTION3DPA_0_GPU; - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dx) { CONVECTION3DPA_1; @@ -208,11 +209,11 @@ void CONVECTION3DPA::runCudaVariantImpl(VariantID vid) { ctx.teamSync(); - 
RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qx) { CONVECTION3DPA_2; @@ -226,11 +227,11 @@ void CONVECTION3DPA::runCudaVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qx) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qy) { CONVECTION3DPA_3; @@ -244,11 +245,11 @@ void CONVECTION3DPA::runCudaVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qx) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qz) { CONVECTION3DPA_4; @@ -262,11 +263,11 @@ void CONVECTION3DPA::runCudaVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qx) { CONVECTION3DPA_5; @@ -280,11 +281,11 @@ void CONVECTION3DPA::runCudaVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, 
RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qx) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dz) { CONVECTION3DPA_6; @@ -298,11 +299,11 @@ void CONVECTION3DPA::runCudaVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qx) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dy) { CONVECTION3DPA_7; @@ -316,11 +317,11 @@ void CONVECTION3DPA::runCudaVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dx) { CONVECTION3DPA_8; @@ -337,6 +338,7 @@ void CONVECTION3DPA::runCudaVariantImpl(VariantID vid) { } // outer lambda (ctx) ); // RAJA::launch + //clang-format on } // loop over kernel reps stopTimer(); diff --git a/src/apps/CONVECTION3DPA-Hip.cpp b/src/apps/CONVECTION3DPA-Hip.cpp index 2edcc8bb0..ce0a5c5c4 100644 --- a/src/apps/CONVECTION3DPA-Hip.cpp +++ b/src/apps/CONVECTION3DPA-Hip.cpp @@ -29,11 +29,11 @@ __global__ void Convection3DPA(const Real_ptr Basis, const Real_ptr tBasis, CONVECTION3DPA_0_GPU; - GPU_FOREACH_THREAD(dz,z,CPA_D1D) + GPU_FOREACH_THREAD(dz,z,conv::D1D) { - GPU_FOREACH_THREAD(dy,y,CPA_D1D) + GPU_FOREACH_THREAD(dy,y,conv::D1D) { - GPU_FOREACH_THREAD(dx,x,CPA_D1D) + GPU_FOREACH_THREAD(dx,x,conv::D1D) { 
CONVECTION3DPA_1; } @@ -41,11 +41,11 @@ __global__ void Convection3DPA(const Real_ptr Basis, const Real_ptr tBasis, } __syncthreads(); - GPU_FOREACH_THREAD(dz,z,CPA_D1D) + GPU_FOREACH_THREAD(dz,z,conv::D1D) { - GPU_FOREACH_THREAD(dy,y,CPA_D1D) + GPU_FOREACH_THREAD(dy,y,conv::D1D) { - GPU_FOREACH_THREAD(qx,x,CPA_Q1D) + GPU_FOREACH_THREAD(qx,x,conv::Q1D) { CONVECTION3DPA_2; } @@ -53,11 +53,11 @@ __global__ void Convection3DPA(const Real_ptr Basis, const Real_ptr tBasis, } __syncthreads(); - GPU_FOREACH_THREAD(dz,z,CPA_D1D) + GPU_FOREACH_THREAD(dz,z,conv::D1D) { - GPU_FOREACH_THREAD(qx,x,CPA_Q1D) + GPU_FOREACH_THREAD(qx,x,conv::Q1D) { - GPU_FOREACH_THREAD(qy,y,CPA_Q1D) + GPU_FOREACH_THREAD(qy,y,conv::Q1D) { CONVECTION3DPA_3; } @@ -65,11 +65,11 @@ __global__ void Convection3DPA(const Real_ptr Basis, const Real_ptr tBasis, } __syncthreads(); - GPU_FOREACH_THREAD(qx,x,CPA_Q1D) + GPU_FOREACH_THREAD(qx,x,conv::Q1D) { - GPU_FOREACH_THREAD(qy,y,CPA_Q1D) + GPU_FOREACH_THREAD(qy,y,conv::Q1D) { - GPU_FOREACH_THREAD(qz,z,CPA_Q1D) + GPU_FOREACH_THREAD(qz,z,conv::Q1D) { CONVECTION3DPA_4; } @@ -77,11 +77,11 @@ __global__ void Convection3DPA(const Real_ptr Basis, const Real_ptr tBasis, } __syncthreads(); - GPU_FOREACH_THREAD(qz,z,CPA_Q1D) + GPU_FOREACH_THREAD(qz,z,conv::Q1D) { - GPU_FOREACH_THREAD(qy,y,CPA_Q1D) + GPU_FOREACH_THREAD(qy,y,conv::Q1D) { - GPU_FOREACH_THREAD(qx,x,CPA_Q1D) + GPU_FOREACH_THREAD(qx,x,conv::Q1D) { CONVECTION3DPA_5; } @@ -89,11 +89,11 @@ __global__ void Convection3DPA(const Real_ptr Basis, const Real_ptr tBasis, } __syncthreads(); - GPU_FOREACH_THREAD(qx,x,CPA_Q1D) + GPU_FOREACH_THREAD(qx,x,conv::Q1D) { - GPU_FOREACH_THREAD(qy,y,CPA_Q1D) + GPU_FOREACH_THREAD(qy,y,conv::Q1D) { - GPU_FOREACH_THREAD(dz,z,CPA_D1D) + GPU_FOREACH_THREAD(dz,z,conv::D1D) { CONVECTION3DPA_6; } @@ -101,11 +101,11 @@ __global__ void Convection3DPA(const Real_ptr Basis, const Real_ptr tBasis, } __syncthreads(); - GPU_FOREACH_THREAD(dz,z,CPA_D1D) + GPU_FOREACH_THREAD(dz,z,conv::D1D) { - 
GPU_FOREACH_THREAD(qx,x,CPA_Q1D) + GPU_FOREACH_THREAD(qx,x,conv::Q1D) { - GPU_FOREACH_THREAD(dy,y,CPA_D1D) + GPU_FOREACH_THREAD(dy,y,conv::D1D) { CONVECTION3DPA_7; } @@ -113,11 +113,11 @@ __global__ void Convection3DPA(const Real_ptr Basis, const Real_ptr tBasis, } __syncthreads(); - GPU_FOREACH_THREAD(dz,z,CPA_D1D) + GPU_FOREACH_THREAD(dz,z,conv::D1D) { - GPU_FOREACH_THREAD(dy,y,CPA_D1D) + GPU_FOREACH_THREAD(dy,y,conv::D1D) { - GPU_FOREACH_THREAD(dx,x,CPA_D1D) + GPU_FOREACH_THREAD(dx,x,conv::D1D) { CONVECTION3DPA_8; } @@ -144,7 +144,7 @@ void CONVECTION3DPA::runHipVariantImpl(VariantID vid) { // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { - dim3 nthreads_per_block(CPA_Q1D, CPA_Q1D, CPA_Q1D); + dim3 nthreads_per_block(conv::Q1D, conv::Q1D, conv::Q1D); constexpr size_t shmem = 0; RPlaunchHipKernel( (Convection3DPA), @@ -162,27 +162,28 @@ void CONVECTION3DPA::runHipVariantImpl(VariantID vid) { constexpr bool async = true; using launch_policy = - RAJA::LaunchPolicy>; + RAJA::LaunchPolicy>; using outer_x = RAJA::LoopPolicy; using inner_x = - RAJA::LoopPolicy>; + RAJA::LoopPolicy>; using inner_y = - RAJA::LoopPolicy>; + RAJA::LoopPolicy>; using inner_z = - RAJA::LoopPolicy>; + RAJA::LoopPolicy>; startTimer(); // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { + //clang-format off RAJA::launch( res, RAJA::LaunchParams(RAJA::Teams(NE), - RAJA::Threads(CPA_Q1D, CPA_Q1D, CPA_Q1D)), + RAJA::Threads(conv::Q1D, conv::Q1D, conv::Q1D)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { RAJA::loop(ctx, RAJA::RangeSegment(0, NE), @@ -190,11 +191,11 @@ void CONVECTION3DPA::runHipVariantImpl(VariantID vid) { CONVECTION3DPA_0_GPU; - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + 
RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dx) { CONVECTION3DPA_1; @@ -208,11 +209,11 @@ void CONVECTION3DPA::runHipVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qx) { CONVECTION3DPA_2; @@ -226,11 +227,11 @@ void CONVECTION3DPA::runHipVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qx) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qy) { CONVECTION3DPA_3; @@ -244,11 +245,11 @@ void CONVECTION3DPA::runHipVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qx) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qz) { CONVECTION3DPA_4; @@ -262,11 +263,11 @@ void CONVECTION3DPA::runHipVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qy) { - 
RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qx) { CONVECTION3DPA_5; @@ -280,11 +281,11 @@ void CONVECTION3DPA::runHipVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qx) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dz) { CONVECTION3DPA_6; @@ -298,11 +299,11 @@ void CONVECTION3DPA::runHipVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qx) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dy) { CONVECTION3DPA_7; @@ -316,11 +317,11 @@ void CONVECTION3DPA::runHipVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dx) { CONVECTION3DPA_8; @@ -337,6 +338,7 @@ void CONVECTION3DPA::runHipVariantImpl(VariantID vid) { } // outer lambda (ctx) ); // RAJA::launch + //clang-format on } // loop over kernel reps stopTimer(); diff --git a/src/apps/CONVECTION3DPA-OMP.cpp b/src/apps/CONVECTION3DPA-OMP.cpp index 7a3ae9a6e..f96940d33 100644 --- a/src/apps/CONVECTION3DPA-OMP.cpp +++ b/src/apps/CONVECTION3DPA-OMP.cpp @@ -35,88 +35,88 @@ void 
CONVECTION3DPA::runOpenMPVariant(VariantID vid) { CONVECTION3DPA_0_CPU; - CPU_FOREACH(dz,z,CPA_D1D) + CPU_FOREACH(dz,z,conv::D1D) { - CPU_FOREACH(dy,y,CPA_D1D) + CPU_FOREACH(dy,y,conv::D1D) { - CPU_FOREACH(dx,x,CPA_D1D) + CPU_FOREACH(dx,x,conv::D1D) { CONVECTION3DPA_1; } } } - CPU_FOREACH(dz,z,CPA_D1D) + CPU_FOREACH(dz,z,conv::D1D) { - CPU_FOREACH(dy,y,CPA_D1D) + CPU_FOREACH(dy,y,conv::D1D) { - CPU_FOREACH(qx,x,CPA_Q1D) + CPU_FOREACH(qx,x,conv::Q1D) { CONVECTION3DPA_2; } } } - CPU_FOREACH(dz,z,CPA_D1D) + CPU_FOREACH(dz,z,conv::D1D) { - CPU_FOREACH(qx,x,CPA_Q1D) + CPU_FOREACH(qx,x,conv::Q1D) { - CPU_FOREACH(qy,y,CPA_Q1D) + CPU_FOREACH(qy,y,conv::Q1D) { CONVECTION3DPA_3; } } } - CPU_FOREACH(qx,x,CPA_Q1D) + CPU_FOREACH(qx,x,conv::Q1D) { - CPU_FOREACH(qy,y,CPA_Q1D) + CPU_FOREACH(qy,y,conv::Q1D) { - CPU_FOREACH(qz,z,CPA_Q1D) + CPU_FOREACH(qz,z,conv::Q1D) { CONVECTION3DPA_4; } } } - CPU_FOREACH(qz,z,CPA_Q1D) + CPU_FOREACH(qz,z,conv::Q1D) { - CPU_FOREACH(qy,y,CPA_Q1D) + CPU_FOREACH(qy,y,conv::Q1D) { - CPU_FOREACH(qx,x,CPA_Q1D) + CPU_FOREACH(qx,x,conv::Q1D) { CONVECTION3DPA_5; } } } - CPU_FOREACH(qx,x,CPA_Q1D) + CPU_FOREACH(qx,x,conv::Q1D) { - CPU_FOREACH(qy,y,CPA_Q1D) + CPU_FOREACH(qy,y,conv::Q1D) { - CPU_FOREACH(dz,z,CPA_D1D) + CPU_FOREACH(dz,z,conv::D1D) { CONVECTION3DPA_6; } } } - CPU_FOREACH(dz,z,CPA_D1D) + CPU_FOREACH(dz,z,conv::D1D) { - CPU_FOREACH(qx,x,CPA_Q1D) + CPU_FOREACH(qx,x,conv::Q1D) { - CPU_FOREACH(dy,y,CPA_D1D) + CPU_FOREACH(dy,y,conv::D1D) { CONVECTION3DPA_7; } } } - CPU_FOREACH(dz,z,CPA_D1D) + CPU_FOREACH(dz,z,conv::D1D) { - CPU_FOREACH(dy,y,CPA_D1D) + CPU_FOREACH(dy,y,conv::D1D) { - CPU_FOREACH(dx,x,CPA_D1D) + CPU_FOREACH(dx,x,conv::D1D) { CONVECTION3DPA_8; } @@ -149,6 +149,7 @@ void CONVECTION3DPA::runOpenMPVariant(VariantID vid) { for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { // Grid is empty as the host does not need a compute grid to be specified + //clang-format off RAJA::launch( res, RAJA::LaunchParams(), [=] 
RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { @@ -158,11 +159,11 @@ void CONVECTION3DPA::runOpenMPVariant(VariantID vid) { CONVECTION3DPA_0_CPU; - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dx) { CONVECTION3DPA_1; @@ -176,11 +177,11 @@ void CONVECTION3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qx) { CONVECTION3DPA_2; @@ -194,11 +195,11 @@ void CONVECTION3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qx) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qy) { CONVECTION3DPA_3; @@ -212,11 +213,11 @@ void CONVECTION3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qx) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qz) { CONVECTION3DPA_4; @@ -230,11 +231,11 @@ void 
CONVECTION3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qx) { CONVECTION3DPA_5; @@ -248,11 +249,11 @@ void CONVECTION3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qx) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dz) { CONVECTION3DPA_6; @@ -266,11 +267,11 @@ void CONVECTION3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qx) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dy) { CONVECTION3DPA_7; @@ -284,11 +285,11 @@ void CONVECTION3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dx) { CONVECTION3DPA_8; @@ -305,6 +306,8 @@ void CONVECTION3DPA::runOpenMPVariant(VariantID vid) { } // outer lambda (ctx) ); // 
RAJA::launch + //clang-format on + } // loop over kernel reps stopTimer(); diff --git a/src/apps/CONVECTION3DPA-Seq.cpp b/src/apps/CONVECTION3DPA-Seq.cpp index 842ebc128..b40727309 100644 --- a/src/apps/CONVECTION3DPA-Seq.cpp +++ b/src/apps/CONVECTION3DPA-Seq.cpp @@ -32,88 +32,88 @@ void CONVECTION3DPA::runSeqVariant(VariantID vid) { CONVECTION3DPA_0_CPU; - CPU_FOREACH(dz,z,CPA_D1D) + CPU_FOREACH(dz,z,conv::D1D) { - CPU_FOREACH(dy,y,CPA_D1D) + CPU_FOREACH(dy,y,conv::D1D) { - CPU_FOREACH(dx,x,CPA_D1D) + CPU_FOREACH(dx,x,conv::D1D) { CONVECTION3DPA_1; } } } - CPU_FOREACH(dz,z,CPA_D1D) + CPU_FOREACH(dz,z,conv::D1D) { - CPU_FOREACH(dy,y,CPA_D1D) + CPU_FOREACH(dy,y,conv::D1D) { - CPU_FOREACH(qx,x,CPA_Q1D) + CPU_FOREACH(qx,x,conv::Q1D) { CONVECTION3DPA_2; } } } - CPU_FOREACH(dz,z,CPA_D1D) + CPU_FOREACH(dz,z,conv::D1D) { - CPU_FOREACH(qx,x,CPA_Q1D) + CPU_FOREACH(qx,x,conv::Q1D) { - CPU_FOREACH(qy,y,CPA_Q1D) + CPU_FOREACH(qy,y,conv::Q1D) { CONVECTION3DPA_3; } } } - CPU_FOREACH(qx,x,CPA_Q1D) + CPU_FOREACH(qx,x,conv::Q1D) { - CPU_FOREACH(qy,y,CPA_Q1D) + CPU_FOREACH(qy,y,conv::Q1D) { - CPU_FOREACH(qz,z,CPA_Q1D) + CPU_FOREACH(qz,z,conv::Q1D) { CONVECTION3DPA_4; } } } - CPU_FOREACH(qz,z,CPA_Q1D) + CPU_FOREACH(qz,z,conv::Q1D) { - CPU_FOREACH(qy,y,CPA_Q1D) + CPU_FOREACH(qy,y,conv::Q1D) { - CPU_FOREACH(qx,x,CPA_Q1D) + CPU_FOREACH(qx,x,conv::Q1D) { CONVECTION3DPA_5; } } } - CPU_FOREACH(qx,x,CPA_Q1D) + CPU_FOREACH(qx,x,conv::Q1D) { - CPU_FOREACH(qy,y,CPA_Q1D) + CPU_FOREACH(qy,y,conv::Q1D) { - CPU_FOREACH(dz,z,CPA_D1D) + CPU_FOREACH(dz,z,conv::D1D) { CONVECTION3DPA_6; } } } - CPU_FOREACH(dz,z,CPA_D1D) + CPU_FOREACH(dz,z,conv::D1D) { - CPU_FOREACH(qx,x,CPA_Q1D) + CPU_FOREACH(qx,x,conv::Q1D) { - CPU_FOREACH(dy,y,CPA_D1D) + CPU_FOREACH(dy,y,conv::D1D) { CONVECTION3DPA_7; } } } - CPU_FOREACH(dz,z,CPA_D1D) + CPU_FOREACH(dz,z,conv::D1D) { - CPU_FOREACH(dy,y,CPA_D1D) + CPU_FOREACH(dy,y,conv::D1D) { - CPU_FOREACH(dx,x,CPA_D1D) + CPU_FOREACH(dx,x,conv::D1D) { CONVECTION3DPA_8; } @@ -147,6 
+147,7 @@ void CONVECTION3DPA::runSeqVariant(VariantID vid) { for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { // Grid is empty as the host does not need a compute grid to be specified + //clang-format off RAJA::launch( res, RAJA::LaunchParams(), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { @@ -156,11 +157,11 @@ void CONVECTION3DPA::runSeqVariant(VariantID vid) { CONVECTION3DPA_0_CPU; - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dx) { CONVECTION3DPA_1; @@ -174,11 +175,11 @@ void CONVECTION3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qx) { CONVECTION3DPA_2; @@ -192,11 +193,11 @@ void CONVECTION3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qx) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qy) { CONVECTION3DPA_3; @@ -210,11 +211,11 @@ void CONVECTION3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qx) { - RAJA::loop(ctx, 
RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qz) { CONVECTION3DPA_4; @@ -228,11 +229,11 @@ void CONVECTION3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qx) { CONVECTION3DPA_5; @@ -246,11 +247,11 @@ void CONVECTION3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qx) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dz) { CONVECTION3DPA_6; @@ -264,11 +265,11 @@ void CONVECTION3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qx) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dy) { CONVECTION3DPA_7; @@ -282,11 +283,11 @@ void CONVECTION3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), 
[&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dx) { CONVECTION3DPA_8; @@ -303,6 +304,7 @@ void CONVECTION3DPA::runSeqVariant(VariantID vid) { } // outer lambda (ctx) ); // RAJA::launch + //clang-format on } // loop over kernel reps stopTimer(); diff --git a/src/apps/CONVECTION3DPA-Sycl.cpp b/src/apps/CONVECTION3DPA-Sycl.cpp index 9c65e9fcc..c2e3f467e 100644 --- a/src/apps/CONVECTION3DPA-Sycl.cpp +++ b/src/apps/CONVECTION3DPA-Sycl.cpp @@ -30,8 +30,8 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { CONVECTION3DPA_DATA_SETUP; - const ::sycl::range<3> workGroupSize(CPA_Q1D, CPA_Q1D, CPA_Q1D); - const ::sycl::range<3> gridSize(CPA_Q1D,CPA_Q1D,CPA_Q1D*NE); + const ::sycl::range<3> workGroupSize(conv::Q1D, conv::Q1D, conv::Q1D); + const ::sycl::range<3> gridSize(conv::Q1D,conv::Q1D,conv::Q1D*NE); constexpr size_t shmem = 0; @@ -45,8 +45,8 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { qu->submit([&](::sycl::handler& h) { - constexpr Index_type max_D1D = CPA_D1D; - constexpr Index_type max_Q1D = CPA_Q1D; + constexpr Index_type max_D1D = conv::D1D; + constexpr Index_type max_Q1D = conv::Q1D; constexpr Index_type max_DQ = (max_Q1D > max_D1D) ? 
max_Q1D : max_D1D; auto sm0_vec = ::sycl::local_accessor(::sycl::range<1>(max_DQ*max_DQ*max_DQ), h); @@ -82,11 +82,11 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { Real_type (*BDGu)[max_Q1D][max_Q1D] = (Real_type (*)[max_Q1D][max_Q1D])sm4; Real_type (*BBDGu)[max_D1D][max_Q1D] = (Real_type (*)[max_D1D][max_Q1D])sm5; - SYCL_FOREACH_THREAD(dz,0,CPA_D1D) + SYCL_FOREACH_THREAD(dz,0,conv::D1D) { - SYCL_FOREACH_THREAD(dy,1,CPA_D1D) + SYCL_FOREACH_THREAD(dy,1,conv::D1D) { - SYCL_FOREACH_THREAD(dx,2,CPA_D1D) + SYCL_FOREACH_THREAD(dx,2,conv::D1D) { CONVECTION3DPA_1; } @@ -94,11 +94,11 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { } itm.barrier(::sycl::access::fence_space::local_space); - SYCL_FOREACH_THREAD(dz,0,CPA_D1D) + SYCL_FOREACH_THREAD(dz,0,conv::D1D) { - SYCL_FOREACH_THREAD(dy,1,CPA_D1D) + SYCL_FOREACH_THREAD(dy,1,conv::D1D) { - SYCL_FOREACH_THREAD(qx,2,CPA_Q1D) + SYCL_FOREACH_THREAD(qx,2,conv::Q1D) { CONVECTION3DPA_2; } @@ -106,11 +106,11 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { } itm.barrier(::sycl::access::fence_space::local_space); - SYCL_FOREACH_THREAD(dz,0,CPA_D1D) + SYCL_FOREACH_THREAD(dz,0,conv::D1D) { - SYCL_FOREACH_THREAD(qx,2,CPA_Q1D) + SYCL_FOREACH_THREAD(qx,2,conv::Q1D) { - SYCL_FOREACH_THREAD(qy,1,CPA_Q1D) + SYCL_FOREACH_THREAD(qy,1,conv::Q1D) { CONVECTION3DPA_3; } @@ -118,11 +118,11 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { } itm.barrier(::sycl::access::fence_space::local_space); - SYCL_FOREACH_THREAD(qx,2,CPA_Q1D) + SYCL_FOREACH_THREAD(qx,2,conv::Q1D) { - SYCL_FOREACH_THREAD(qy,1,CPA_Q1D) + SYCL_FOREACH_THREAD(qy,1,conv::Q1D) { - SYCL_FOREACH_THREAD(qz,0,CPA_Q1D) + SYCL_FOREACH_THREAD(qz,0,conv::Q1D) { CONVECTION3DPA_4; } @@ -130,11 +130,11 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { } itm.barrier(::sycl::access::fence_space::local_space); - SYCL_FOREACH_THREAD(qz,0,CPA_Q1D) + SYCL_FOREACH_THREAD(qz,0,conv::Q1D) { - SYCL_FOREACH_THREAD(qy,1,CPA_Q1D) + 
SYCL_FOREACH_THREAD(qy,1,conv::Q1D) { - SYCL_FOREACH_THREAD(qx,2,CPA_Q1D) + SYCL_FOREACH_THREAD(qx,2,conv::Q1D) { CONVECTION3DPA_5; } @@ -142,11 +142,11 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { } itm.barrier(::sycl::access::fence_space::local_space); - SYCL_FOREACH_THREAD(qx,2,CPA_Q1D) + SYCL_FOREACH_THREAD(qx,2,conv::Q1D) { - SYCL_FOREACH_THREAD(qy,1,CPA_Q1D) + SYCL_FOREACH_THREAD(qy,1,conv::Q1D) { - SYCL_FOREACH_THREAD(dz,0,CPA_D1D) + SYCL_FOREACH_THREAD(dz,0,conv::D1D) { CONVECTION3DPA_6; } @@ -154,11 +154,11 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { } itm.barrier(::sycl::access::fence_space::local_space); - SYCL_FOREACH_THREAD(dz,0,CPA_D1D) + SYCL_FOREACH_THREAD(dz,0,conv::D1D) { - SYCL_FOREACH_THREAD(qx,2,CPA_Q1D) + SYCL_FOREACH_THREAD(qx,2,conv::Q1D) { - SYCL_FOREACH_THREAD(dy,1,CPA_D1D) + SYCL_FOREACH_THREAD(dy,1,conv::D1D) { CONVECTION3DPA_7; } @@ -166,11 +166,11 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { } itm.barrier(::sycl::access::fence_space::local_space); - SYCL_FOREACH_THREAD(dz,0,CPA_D1D) + SYCL_FOREACH_THREAD(dz,0,conv::D1D) { - SYCL_FOREACH_THREAD(dy,1,CPA_D1D) + SYCL_FOREACH_THREAD(dy,1,conv::D1D) { - SYCL_FOREACH_THREAD(dx,2,CPA_D1D) + SYCL_FOREACH_THREAD(dx,2,conv::D1D) { CONVECTION3DPA_8; } @@ -209,8 +209,8 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { //Caclulate amount of shared memory needed size_t shmem = 0; { - constexpr Index_type max_D1D = CPA_D1D; - constexpr Index_type max_Q1D = CPA_Q1D; + constexpr Index_type max_D1D = conv::D1D; + constexpr Index_type max_Q1D = conv::Q1D; constexpr Index_type max_DQ = (max_Q1D > max_D1D) ? 
max_Q1D : max_D1D; constexpr Index_type no_mats = 6; @@ -221,17 +221,18 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { + //clang-format off RAJA::launch( res, RAJA::LaunchParams(RAJA::Teams(NE), - RAJA::Threads(CPA_Q1D, CPA_Q1D, CPA_Q1D), shmem), + RAJA::Threads(conv::Q1D, conv::Q1D, conv::Q1D), shmem), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { RAJA::loop(ctx, RAJA::RangeSegment(0, NE), [&](Index_type e) { //Redefine inside the lambda to keep consistent with base version - constexpr Index_type max_D1D = CPA_D1D; - constexpr Index_type max_Q1D = CPA_Q1D; + constexpr Index_type max_D1D = conv::D1D; + constexpr Index_type max_Q1D = conv::Q1D; constexpr Index_type max_DQ = (max_Q1D > max_D1D) ? max_Q1D : max_D1D; Real_ptr sm0 = ctx.getSharedMemory(max_DQ*max_DQ*max_DQ); @@ -254,11 +255,11 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { Real_type (*BDGu)[max_Q1D][max_Q1D] = (Real_type (*)[max_Q1D][max_Q1D])sm4; Real_type (*BBDGu)[max_D1D][max_Q1D] = (Real_type (*)[max_D1D][max_Q1D])sm5; - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dx) { CONVECTION3DPA_1; @@ -272,11 +273,11 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), 
[&](Index_type qx) { CONVECTION3DPA_2; @@ -290,11 +291,11 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qx) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qy) { CONVECTION3DPA_3; @@ -308,11 +309,11 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qx) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qz) { CONVECTION3DPA_4; @@ -326,11 +327,11 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qx) { CONVECTION3DPA_5; @@ -344,11 +345,11 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qx) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dz) { CONVECTION3DPA_6; @@ -362,11 +363,11 @@ void 
CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::Q1D), [&](Index_type qx) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dy) { CONVECTION3DPA_7; @@ -380,11 +381,11 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, conv::D1D), [&](Index_type dx) { CONVECTION3DPA_8; @@ -401,6 +402,7 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { } // outer lambda (ctx) ); // RAJA::launch + //clang-format on } // loop over kernel reps stopTimer(); diff --git a/src/apps/CONVECTION3DPA.cpp b/src/apps/CONVECTION3DPA.cpp index c92db5004..2b9d91a75 100644 --- a/src/apps/CONVECTION3DPA.cpp +++ b/src/apps/CONVECTION3DPA.cpp @@ -25,31 +25,32 @@ CONVECTION3DPA::CONVECTION3DPA(const RunParams& params) { m_NE_default = 15625; - setDefaultProblemSize(m_NE_default*CPA_Q1D*CPA_Q1D*CPA_Q1D); + setDefaultProblemSize(m_NE_default*conv::D1D*conv::D1D*conv::D1D); setDefaultReps(50); - m_NE = std::max((getTargetProblemSize() + (CPA_Q1D*CPA_Q1D*CPA_Q1D)/2) / (CPA_Q1D*CPA_Q1D*CPA_Q1D), Index_type(1)); + //Define problem size in terms of DOFS + m_NE = std::max((getTargetProblemSize() + (conv::D1D*conv::D1D*conv::D1D)/2) / (conv::D1D*conv::D1D*conv::D1D), Index_type(1)); - setActualProblemSize( m_NE*CPA_Q1D*CPA_Q1D*CPA_Q1D ); + setActualProblemSize( m_NE*conv::D1D*conv::D1D*conv::D1D ); - setItsPerRep( m_NE*CPA_Q1D*CPA_Q1D*CPA_Q1D ); + 
setItsPerRep( m_NE*conv::D1D*conv::D1D*conv::D1D ); setKernelsPerRep(1); - setBytesReadPerRep( 3*sizeof(Real_type) * CPA_Q1D*CPA_D1D + // b, bt, g - 1*sizeof(Real_type) * CPA_D1D*CPA_D1D*CPA_D1D*m_NE + // x - CPA_VDIM*sizeof(Real_type) * CPA_Q1D*CPA_Q1D*CPA_Q1D*m_NE ); // d + setBytesReadPerRep( 3*sizeof(Real_type) * conv::Q1D*conv::D1D + // b, bt, g + 1*sizeof(Real_type) * conv::D1D*conv::D1D*conv::D1D*m_NE + // x + conv::VDIM*sizeof(Real_type) * conv::Q1D*conv::Q1D*conv::Q1D*m_NE ); // d setBytesWrittenPerRep( 0 ); - setBytesModifyWrittenPerRep( 1*sizeof(Real_type) * CPA_D1D*CPA_D1D*CPA_D1D*m_NE ); // y + setBytesModifyWrittenPerRep( 1*sizeof(Real_type) * conv::D1D*conv::D1D*conv::D1D*m_NE ); // y setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(m_NE * ( - 4 * CPA_D1D * CPA_Q1D * CPA_D1D * CPA_D1D + //2 - 6 * CPA_D1D * CPA_Q1D * CPA_Q1D * CPA_D1D + //3 - 6 * CPA_D1D * CPA_Q1D * CPA_Q1D * CPA_Q1D + //4 - 5 * CPA_Q1D * CPA_Q1D * CPA_Q1D + // 5 - 2 * CPA_Q1D * CPA_D1D * CPA_Q1D * CPA_Q1D + // 6 - 2 * CPA_Q1D * CPA_D1D * CPA_Q1D * CPA_D1D + // 7 - (1 + 2*CPA_Q1D) * CPA_D1D * CPA_D1D * CPA_D1D // 8 + 4 * conv::D1D * conv::Q1D * conv::D1D * conv::D1D + //2 + 6 * conv::D1D * conv::Q1D * conv::Q1D * conv::D1D + //3 + 6 * conv::D1D * conv::Q1D * conv::Q1D * conv::Q1D + //4 + 5 * conv::Q1D * conv::Q1D * conv::Q1D + // 5 + 2 * conv::Q1D * conv::D1D * conv::Q1D * conv::Q1D + // 6 + 2 * conv::Q1D * conv::D1D * conv::Q1D * conv::D1D + // 7 + (1 + 2*conv::Q1D) * conv::D1D * conv::D1D * conv::D1D // 8 )); setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); @@ -69,17 +70,17 @@ CONVECTION3DPA::~CONVECTION3DPA() void CONVECTION3DPA::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { - allocAndInitDataConst(m_B, Index_type(CPA_Q1D*CPA_D1D), Real_type(1.0), vid); - allocAndInitDataConst(m_Bt, Index_type(CPA_Q1D*CPA_D1D), Real_type(1.0), vid); - allocAndInitDataConst(m_G, Index_type(CPA_Q1D*CPA_D1D), Real_type(1.0), vid); - 
allocAndInitDataConst(m_D, Index_type(CPA_Q1D*CPA_Q1D*CPA_Q1D*CPA_VDIM*m_NE), Real_type(1.0), vid); - allocAndInitDataConst(m_X, Index_type(CPA_D1D*CPA_D1D*CPA_D1D*m_NE), Real_type(1.0), vid); - allocAndInitDataConst(m_Y, Index_type(CPA_D1D*CPA_D1D*CPA_D1D*m_NE), Real_type(0.0), vid); + allocAndInitDataConst(m_B, Index_type(conv::Q1D*conv::D1D), Real_type(1.0), vid); + allocAndInitDataConst(m_Bt, Index_type(conv::Q1D*conv::D1D), Real_type(1.0), vid); + allocAndInitDataConst(m_G, Index_type(conv::Q1D*conv::D1D), Real_type(1.0), vid); + allocAndInitDataConst(m_D, Index_type(conv::Q1D*conv::Q1D*conv::Q1D*conv::VDIM*m_NE), Real_type(1.0), vid); + allocAndInitDataConst(m_X, Index_type(conv::D1D*conv::D1D*conv::D1D*m_NE), Real_type(1.0), vid); + allocAndInitDataConst(m_Y, Index_type(conv::D1D*conv::D1D*conv::D1D*m_NE), Real_type(0.0), vid); } void CONVECTION3DPA::updateChecksum(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { - addToChecksum(m_Y, CPA_D1D*CPA_D1D*CPA_D1D*m_NE, vid); + addToChecksum(m_Y, conv::D1D*conv::D1D*conv::D1D*m_NE, vid); } void CONVECTION3DPA::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/apps/CONVECTION3DPA.hpp b/src/apps/CONVECTION3DPA.hpp index f495f8b4f..f3d5845ff 100644 --- a/src/apps/CONVECTION3DPA.hpp +++ b/src/apps/CONVECTION3DPA.hpp @@ -10,49 +10,49 @@ /// Action of 3D diffusion matrix via partial assembly /// /// Based on MFEM's/CEED algorithms. -/// Reference implementation -/// https://github.com/mfem/mfem/blob/master/fem/bilininteg_convection_pa.cpp +/// Reference implementation - MFEM-v4.9 +/// https://github.com/mfem/mfem/blob/v4.9/fem/integ/bilininteg_convection_kernels.hpp /// /// -/// for(int e = 0; e < NE; ++e) { +/// for(Index_type e = 0; e < NE; ++e) { /// -/// constexpr int max_D1D = CPA_D1D; -/// constexpr int max_Q1D = CPA_Q1D; -/// constexpr int max_DQ = (max_Q1D > max_D1D) ? 
max_Q1D : max_D1D; -/// MFEM_SHARED double sm0[max_DQ*max_DQ*max_DQ]; -/// MFEM_SHARED double sm1[max_DQ*max_DQ*max_DQ]; -/// MFEM_SHARED double sm2[max_DQ*max_DQ*max_DQ]; -/// MFEM_SHARED double sm3[max_DQ*max_DQ*max_DQ]; -/// MFEM_SHARED double sm4[max_DQ*max_DQ*max_DQ]; -/// MFEM_SHARED double sm5[max_DQ*max_DQ*max_DQ]; +/// constexpr Index_type max_D1D = CPA_D1D; +/// constexpr Index_type max_Q1D = CPA_Q1D; +/// constexpr Index_type max_DQ = (max_Q1D > max_D1D) ? max_Q1D : max_D1D; +/// MFEM_SHARED Real_type sm0[max_DQ*max_DQ*max_DQ]; +/// MFEM_SHARED Real_type sm1[max_DQ*max_DQ*max_DQ]; +/// MFEM_SHARED Real_type sm2[max_DQ*max_DQ*max_DQ]; +/// MFEM_SHARED Real_type sm3[max_DQ*max_DQ*max_DQ]; +/// MFEM_SHARED Real_type sm4[max_DQ*max_DQ*max_DQ]; +/// MFEM_SHARED Real_type sm5[max_DQ*max_DQ*max_DQ]; /// -/// double (*u)[max_D1D][max_D1D] = (double (*)[max_D1D][max_D1D]) sm0; -/// for(int dz = 0; dz < CPA_D1D; ++dz) +/// Real_type (*u)[max_D1D][max_D1D] = (Real_type (*)[max_D1D][max_D1D]) sm0; +/// for(Index_type dz = 0; dz < CPA_D1D; ++dz) /// { -/// for(int dy = 0; dy < CPA_D1D; ++dy) +/// for(Index_type dy = 0; dy < CPA_D1D; ++dy) /// { -/// for(int dx = 0; dx < CPA_D1D; ++dx) +/// for(Index_type dx = 0; dx < CPA_D1D; ++dx) /// { /// u[dz][dy][dx] = CPA_X(dx,dy,dz,e); /// } /// } /// } /// MFEM_SYNC_THREAD; -/// double (*Bu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm1; -/// double (*Gu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm2; -/// for(int dz = 0; dz < CPA_D1D; ++dz) +/// Real_type (*Bu)[max_D1D][max_Q1D] = (Real_type (*)[max_D1D][max_Q1D])sm1; +/// Real_type (*Gu)[max_D1D][max_Q1D] = (Real_type (*)[max_D1D][max_Q1D])sm2; +/// for(Index_type dz = 0; dz < CPA_D1D; ++dz) /// { -/// for(int dy = 0; dy < CPA_D1D; ++dy) +/// for(Index_type dy = 0; dy < CPA_D1D; ++dy) /// { -/// for(int qx = 0; qx < CPA_Q1D; ++qx) +/// for(Index_type qx = 0; qx < CPA_Q1D; ++qx) /// { -/// double Bu_ = 0.0; -/// double Gu_ = 0.0; -/// for(int dx = 0; dx < 
CPA_D1D; ++dx) +/// Real_type Bu_ = 0.0; +/// Real_type Gu_ = 0.0; +/// for(Index_type dx = 0; dx < CPA_D1D; ++dx) /// { -/// const double bx = CPA_B(qx,dx); -/// const double gx = CPA_G(qx,dx); -/// const double x = u[dz][dy][dx]; +/// const Real_type bx = CPA_B(qx,dx); +/// const Real_type gx = CPA_G(qx,dx); +/// const Real_type x = u[dz][dy][dx]; /// Bu_ += bx * x; /// Gu_ += gx * x; /// } @@ -62,22 +62,22 @@ /// } /// } /// MFEM_SYNC_THREAD; -/// double (*BBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm3; -/// double (*GBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm4; -/// double (*BGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm5; -/// for(int dz = 0; dz < CPA_D1D; ++dz) +/// Real_type (*BBu)[max_Q1D][max_Q1D] = (Real_type (*)[max_Q1D][max_Q1D])sm3; +/// Real_type (*GBu)[max_Q1D][max_Q1D] = (Real_type (*)[max_Q1D][max_Q1D])sm4; +/// Real_type (*BGu)[max_Q1D][max_Q1D] = (Real_type (*)[max_Q1D][max_Q1D])sm5; +/// for(Index_type dz = 0; dz < CPA_D1D; ++dz) /// { -/// for(int qx = 0; qx < CPA_Q1D; ++qx) +/// for(Index_type qx = 0; qx < CPA_Q1D; ++qx) /// { -/// for(int qy = 0; qy < CPA_Q1D; ++qy) +/// for(Index_type qy = 0; qy < CPA_Q1D; ++qy) /// { -/// double BBu_ = 0.0; -/// double GBu_ = 0.0; -/// double BGu_ = 0.0; -/// for(int dy = 0; dy < CPA_D1D; ++dy) +/// Real_type BBu_ = 0.0; +/// Real_type GBu_ = 0.0; +/// Real_type BGu_ = 0.0; +/// for(Index_type dy = 0; dy < CPA_D1D; ++dy) /// { -/// const double bx = CPA_B(qy,dy); -/// const double gx = CPA_G(qy,dy); +/// const Real_type bx = CPA_B(qy,dy); +/// const Real_type gx = CPA_G(qy,dy); /// BBu_ += bx * Bu[dz][dy][qx]; /// GBu_ += gx * Bu[dz][dy][qx]; /// BGu_ += bx * Gu[dz][dy][qx]; @@ -89,22 +89,23 @@ /// } /// } /// MFEM_SYNC_THREAD; -/// double (*GBBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm0; -/// double (*BGBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm1; -/// double (*BBGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm2; -/// for(int qx = 0; 
qx < CPA_Q1D; ++qx) +/// Real_type (*GBBu)[max_Q1D][max_Q1D] = (Real_type (*)[max_Q1D][max_Q1D])sm0; +/// Real_type (*BGBu)[max_Q1D][max_Q1D] = (Real_type (*)[max_Q1D][max_Q1D])sm1; +/// Real_type (*BBGu)[max_Q1D][max_Q1D] = (Real_type (*)[max_Q1D][max_Q1D])sm2; +/// +/// for(Index_type qx = 0; qx max_D1D) ? max_Q1D : max_D1D; \ - RAJA_TEAM_SHARED double sm0[max_DQ*max_DQ*max_DQ]; \ - RAJA_TEAM_SHARED double sm1[max_DQ*max_DQ*max_DQ]; \ - RAJA_TEAM_SHARED double sm2[max_DQ*max_DQ*max_DQ]; \ - RAJA_TEAM_SHARED double sm3[max_DQ*max_DQ*max_DQ]; \ - RAJA_TEAM_SHARED double sm4[max_DQ*max_DQ*max_DQ]; \ - RAJA_TEAM_SHARED double sm5[max_DQ*max_DQ*max_DQ]; \ - double (*u)[max_D1D][max_D1D] = (double (*)[max_D1D][max_D1D]) sm0; \ - double (*Bu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm1; \ - double (*Gu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm2; \ - double (*BBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm3; \ - double (*GBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm4; \ - double (*BGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm5; \ - double (*GBBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm0; \ - double (*BGBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm1; \ - double (*BBGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm2; \ - double (*DGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm3; \ - double (*BDGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm4; \ - double (*BBDGu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm5; +#define CPA_B(x, y) Basis[x + conv::Q1D * y] +#define CPA_Bt(x, y) tBasis[x + conv::D1D * y] +#define CPA_G(x, y) dBasis[x + conv::Q1D * y] +#define CPA_X(dx, dy, dz, e) \ + X[dx + conv::D1D * dy + conv::D1D * conv::D1D * dz + \ + conv::D1D * conv::D1D * conv::D1D * e] +#define CPA_Y(dx, dy, dz, e) \ + Y[dx + conv::D1D * dy + conv::D1D * conv::D1D * dz + \ + conv::D1D * conv::D1D * conv::D1D * e] +#define CPA_op(qx, qy, qz, d, e) \ + D[qx + conv::Q1D * qy 
+ conv::Q1D * conv::Q1D * qz + \ + conv::Q1D * conv::Q1D * conv::Q1D * d + \ + conv::VDIM * conv::Q1D * conv::Q1D * conv::Q1D * e] +#define CONVECTION3DPA_0_GPU \ + constexpr Index_type max_D1D = conv::D1D; \ + constexpr Index_type max_Q1D = conv::Q1D; \ + constexpr Index_type max_DQ = (max_Q1D > max_D1D) ? max_Q1D : max_D1D; \ + RAJA_TEAM_SHARED Real_type sm0[max_DQ * max_DQ * max_DQ]; \ + RAJA_TEAM_SHARED Real_type sm1[max_DQ * max_DQ * max_DQ]; \ + RAJA_TEAM_SHARED Real_type sm2[max_DQ * max_DQ * max_DQ]; \ + RAJA_TEAM_SHARED Real_type sm3[max_DQ * max_DQ * max_DQ]; \ + RAJA_TEAM_SHARED Real_type sm4[max_DQ * max_DQ * max_DQ]; \ + RAJA_TEAM_SHARED Real_type sm5[max_DQ * max_DQ * max_DQ]; \ + Real_type(*u)[max_D1D][max_D1D] = (Real_type(*)[max_D1D][max_D1D])sm0; \ + Real_type(*Bu)[max_D1D][max_Q1D] = (Real_type(*)[max_D1D][max_Q1D])sm1; \ + Real_type(*Gu)[max_D1D][max_Q1D] = (Real_type(*)[max_D1D][max_Q1D])sm2; \ + Real_type(*BBu)[max_Q1D][max_Q1D] = (Real_type(*)[max_Q1D][max_Q1D])sm3; \ + Real_type(*GBu)[max_Q1D][max_Q1D] = (Real_type(*)[max_Q1D][max_Q1D])sm4; \ + Real_type(*BGu)[max_Q1D][max_Q1D] = (Real_type(*)[max_Q1D][max_Q1D])sm5; \ + Real_type(*GBBu)[max_Q1D][max_Q1D] = (Real_type(*)[max_Q1D][max_Q1D])sm0; \ + Real_type(*BGBu)[max_Q1D][max_Q1D] = (Real_type(*)[max_Q1D][max_Q1D])sm1; \ + Real_type(*BBGu)[max_Q1D][max_Q1D] = (Real_type(*)[max_Q1D][max_Q1D])sm2; \ + Real_type(*DGu)[max_Q1D][max_Q1D] = (Real_type(*)[max_Q1D][max_Q1D])sm3; \ + Real_type(*BDGu)[max_Q1D][max_Q1D] = (Real_type(*)[max_Q1D][max_Q1D])sm4; \ + Real_type(*BBDGu)[max_D1D][max_Q1D] = (Real_type(*)[max_D1D][max_Q1D])sm5; -#define CONVECTION3DPA_0_CPU \ - constexpr int max_D1D = CPA_D1D; \ - constexpr int max_Q1D = CPA_Q1D; \ - constexpr int max_DQ = (max_Q1D > max_D1D) ? 
max_Q1D : max_D1D; \ - double sm0[max_DQ*max_DQ*max_DQ]; \ - double sm1[max_DQ*max_DQ*max_DQ]; \ - double sm2[max_DQ*max_DQ*max_DQ]; \ - double sm3[max_DQ*max_DQ*max_DQ]; \ - double sm4[max_DQ*max_DQ*max_DQ]; \ - double sm5[max_DQ*max_DQ*max_DQ]; \ - double (*u)[max_D1D][max_D1D] = (double (*)[max_D1D][max_D1D]) sm0; \ - double (*Bu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm1; \ - double (*Gu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm2; \ - double (*BBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm3; \ - double (*GBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm4; \ - double (*BGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm5; \ - double (*GBBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm0; \ - double (*BGBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm1; \ - double (*BBGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm2; \ - double (*DGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm3; \ - double (*BDGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm4; \ - double (*BBDGu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm5; +#define CONVECTION3DPA_0_CPU \ + constexpr Index_type max_D1D = conv::D1D; \ + constexpr Index_type max_Q1D = conv::Q1D; \ + constexpr Index_type max_DQ = (max_Q1D > max_D1D) ? 
max_Q1D : max_D1D; \ + Real_type sm0[max_DQ * max_DQ * max_DQ]; \ + Real_type sm1[max_DQ * max_DQ * max_DQ]; \ + Real_type sm2[max_DQ * max_DQ * max_DQ]; \ + Real_type sm3[max_DQ * max_DQ * max_DQ]; \ + Real_type sm4[max_DQ * max_DQ * max_DQ]; \ + Real_type sm5[max_DQ * max_DQ * max_DQ]; \ + Real_type(*u)[max_D1D][max_D1D] = (Real_type(*)[max_D1D][max_D1D])sm0; \ + Real_type(*Bu)[max_D1D][max_Q1D] = (Real_type(*)[max_D1D][max_Q1D])sm1; \ + Real_type(*Gu)[max_D1D][max_Q1D] = (Real_type(*)[max_D1D][max_Q1D])sm2; \ + Real_type(*BBu)[max_Q1D][max_Q1D] = (Real_type(*)[max_Q1D][max_Q1D])sm3; \ + Real_type(*GBu)[max_Q1D][max_Q1D] = (Real_type(*)[max_Q1D][max_Q1D])sm4; \ + Real_type(*BGu)[max_Q1D][max_Q1D] = (Real_type(*)[max_Q1D][max_Q1D])sm5; \ + Real_type(*GBBu)[max_Q1D][max_Q1D] = (Real_type(*)[max_Q1D][max_Q1D])sm0; \ + Real_type(*BGBu)[max_Q1D][max_Q1D] = (Real_type(*)[max_Q1D][max_Q1D])sm1; \ + Real_type(*BBGu)[max_Q1D][max_Q1D] = (Real_type(*)[max_Q1D][max_Q1D])sm2; \ + Real_type(*DGu)[max_Q1D][max_Q1D] = (Real_type(*)[max_Q1D][max_Q1D])sm3; \ + Real_type(*BDGu)[max_Q1D][max_Q1D] = (Real_type(*)[max_Q1D][max_Q1D])sm4; \ + Real_type(*BBDGu)[max_D1D][max_Q1D] = (Real_type(*)[max_D1D][max_Q1D])sm5; -#define CONVECTION3DPA_1 \ - u[dz][dy][dx] = CPA_X(dx,dy,dz,e); +#define CONVECTION3DPA_1 u[dz][dy][dx] = CPA_X(dx, dy, dz, e); -#define CONVECTION3DPA_2 \ - Real_type Bu_ = 0.0; \ - Real_type Gu_ = 0.0; \ - for (Index_type dx = 0; dx < CPA_D1D; ++dx) \ - { \ - const Real_type bx = CPA_B(qx,dx); \ - const Real_type gx = CPA_G(qx,dx); \ - const Real_type x = u[dz][dy][dx]; \ - Bu_ += bx * x; \ - Gu_ += gx * x; \ - } \ - Bu[dz][dy][qx] = Bu_; \ +#define CONVECTION3DPA_2 \ + Real_type Bu_ = 0.0; \ + Real_type Gu_ = 0.0; \ + for (Index_type dx = 0; dx < conv::D1D; ++dx) { \ + const Real_type bx = CPA_B(qx, dx); \ + const Real_type gx = CPA_G(qx, dx); \ + const Real_type x = u[dz][dy][dx]; \ + Bu_ += bx * x; \ + Gu_ += gx * x; \ + } \ + Bu[dz][dy][qx] = Bu_; \ Gu[dz][dy][qx] = 
Gu_; -#define CONVECTION3DPA_3 \ - Real_type BBu_ = 0.0; \ - Real_type GBu_ = 0.0; \ - Real_type BGu_ = 0.0; \ - for (Index_type dy = 0; dy < CPA_D1D; ++dy) \ - { \ - const Real_type bx = CPA_B(qy,dy); \ - const Real_type gx = CPA_G(qy,dy); \ - BBu_ += bx * Bu[dz][dy][qx]; \ - GBu_ += gx * Bu[dz][dy][qx]; \ - BGu_ += bx * Gu[dz][dy][qx]; \ - } \ - BBu[dz][qy][qx] = BBu_; \ - GBu[dz][qy][qx] = GBu_; \ +#define CONVECTION3DPA_3 \ + Real_type BBu_ = 0.0; \ + Real_type GBu_ = 0.0; \ + Real_type BGu_ = 0.0; \ + for (Index_type dy = 0; dy < conv::D1D; ++dy) { \ + const Real_type bx = CPA_B(qy, dy); \ + const Real_type gx = CPA_G(qy, dy); \ + BBu_ += bx * Bu[dz][dy][qx]; \ + GBu_ += gx * Bu[dz][dy][qx]; \ + BGu_ += bx * Gu[dz][dy][qx]; \ + } \ + BBu[dz][qy][qx] = BBu_; \ + GBu[dz][qy][qx] = GBu_; \ BGu[dz][qy][qx] = BGu_; -#define CONVECTION3DPA_4 \ - Real_type GBBu_ = 0.0; \ - Real_type BGBu_ = 0.0; \ - Real_type BBGu_ = 0.0; \ - for (Index_type dz = 0; dz < CPA_D1D; ++dz) \ - { \ - const Real_type bx = CPA_B(qz,dz); \ - const Real_type gx = CPA_G(qz,dz); \ - GBBu_ += gx * BBu[dz][qy][qx]; \ - BGBu_ += bx * GBu[dz][qy][qx]; \ - BBGu_ += bx * BGu[dz][qy][qx]; \ - } \ - GBBu[qz][qy][qx] = GBBu_; \ - BGBu[qz][qy][qx] = BGBu_; \ +#define CONVECTION3DPA_4 \ + Real_type GBBu_ = 0.0; \ + Real_type BGBu_ = 0.0; \ + Real_type BBGu_ = 0.0; \ + for (Index_type dz = 0; dz < conv::D1D; ++dz) { \ + const Real_type bx = CPA_B(qz, dz); \ + const Real_type gx = CPA_G(qz, dz); \ + GBBu_ += gx * BBu[dz][qy][qx]; \ + BGBu_ += bx * GBu[dz][qy][qx]; \ + BBGu_ += bx * BGu[dz][qy][qx]; \ + } \ + GBBu[qz][qy][qx] = GBBu_; \ + BGBu[qz][qy][qx] = BGBu_; \ BBGu[qz][qy][qx] = BBGu_; -#define CONVECTION3DPA_5 \ - const Real_type O1 = CPA_op(qx,qy,qz,0,e); \ - const Real_type O2 = CPA_op(qx,qy,qz,1,e); \ - const Real_type O3 = CPA_op(qx,qy,qz,2,e); \ - const Real_type gradX = BBGu[qz][qy][qx]; \ - const Real_type gradY = BGBu[qz][qy][qx]; \ - const Real_type gradZ = GBBu[qz][qy][qx]; \ +#define 
CONVECTION3DPA_5 \ + const Real_type O1 = CPA_op(qx, qy, qz, 0, e); \ + const Real_type O2 = CPA_op(qx, qy, qz, 1, e); \ + const Real_type O3 = CPA_op(qx, qy, qz, 2, e); \ + const Real_type gradX = BBGu[qz][qy][qx]; \ + const Real_type gradY = BGBu[qz][qy][qx]; \ + const Real_type gradZ = GBBu[qz][qy][qx]; \ DGu[qz][qy][qx] = (O1 * gradX) + (O2 * gradY) + (O3 * gradZ); -#define CONVECTION3DPA_6 \ - Real_type BDGu_ = 0.0; \ - for (Index_type qz = 0; qz < CPA_Q1D; ++qz) \ - { \ - const Real_type w = CPA_Bt(dz,qz); \ - BDGu_ += w * DGu[qz][qy][qx]; \ - } \ - BDGu[dz][qy][qx] = BDGu_; +#define CONVECTION3DPA_6 \ + Real_type BDGu_ = 0.0; \ + for (Index_type qz = 0; qz < conv::Q1D; ++qz) { \ + const Real_type w = CPA_Bt(dz, qz); \ + BDGu_ += w * DGu[qz][qy][qx]; \ + } \ + BDGu[dz][qy][qx] = BDGu_; -#define CONVECTION3DPA_7 \ - Real_type BBDGu_ = 0.0; \ - for (Index_type qy = 0; qy < CPA_Q1D; ++qy) \ - { \ - const Real_type w = CPA_Bt(dy,qy); \ - BBDGu_ += w * BDGu[dz][qy][qx]; \ - } \ - BBDGu[dz][dy][qx] = BBDGu_; \ +#define CONVECTION3DPA_7 \ + Real_type BBDGu_ = 0.0; \ + for (Index_type qy = 0; qy < conv::Q1D; ++qy) { \ + const Real_type w = CPA_Bt(dy, qy); \ + BBDGu_ += w * BDGu[dz][qy][qx]; \ + } \ + BBDGu[dz][dy][qx] = BBDGu_; -#define CONVECTION3DPA_8 \ - Real_type BBBDGu = 0.0; \ - for (Index_type qx = 0; qx < CPA_Q1D; ++qx) \ - { \ - const Real_type w = CPA_Bt(dx,qx); \ - BBBDGu += w * BBDGu[dz][dy][qx]; \ - } \ - CPA_Y(dx,dy,dz,e) += BBBDGu; +#define CONVECTION3DPA_8 \ + Real_type BBBDGu = 0.0; \ + for (Index_type qx = 0; qx < conv::Q1D; ++qx) { \ + const Real_type w = CPA_Bt(dx, qx); \ + BBBDGu += w * BBDGu[dz][dy][qx]; \ + } \ + CPA_Y(dx, dy, dz, e) += BBBDGu; -namespace rajaperf -{ +namespace rajaperf { class RunParams; -namespace apps -{ +namespace apps { -class CONVECTION3DPA : public KernelBase -{ +class CONVECTION3DPA : public KernelBase { public: - - CONVECTION3DPA(const RunParams& params); + CONVECTION3DPA(const RunParams &params); ~CONVECTION3DPA(); @@
-382,15 +378,13 @@ class CONVECTION3DPA : public KernelBase void runSeqVariant(VariantID vid); void runOpenMPVariant(VariantID vid); - template < size_t block_size > - void runCudaVariantImpl(VariantID vid); - template < size_t block_size > - void runHipVariantImpl(VariantID vid); - template < size_t work_group_size > - void runSyclVariantImpl(VariantID vid); + template void runCudaVariantImpl(VariantID vid); + template void runHipVariantImpl(VariantID vid); + template void runSyclVariantImpl(VariantID vid); private: - static const size_t default_gpu_block_size = CPA_Q1D * CPA_Q1D * CPA_Q1D; + static const size_t default_gpu_block_size = + conv::Q1D * conv::Q1D * conv::Q1D; using gpu_block_sizes_type = integer::list_type; Real_ptr m_B; diff --git a/src/apps/DIFFUSION3DPA-Cuda.cpp b/src/apps/DIFFUSION3DPA-Cuda.cpp index 94d52e7a2..1340c123e 100644 --- a/src/apps/DIFFUSION3DPA-Cuda.cpp +++ b/src/apps/DIFFUSION3DPA-Cuda.cpp @@ -32,77 +32,78 @@ __global__ void Diffusion3DPA(const Real_ptr Basis, DIFFUSION3DPA_0_GPU; - GPU_FOREACH_THREAD(dz, z, DPA_D1D) { - GPU_FOREACH_THREAD(dy, y, DPA_D1D) { - GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(dz, z, diff::D1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, diff::D1D) { + GPU_FOREACH_THREAD_DIRECT(dx, x, diff::D1D) { DIFFUSION3DPA_1; } } } if (threadIdx.z == 0) { - GPU_FOREACH_THREAD(dy, y, DPA_D1D) { - GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, diff::D1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, diff::Q1D) { DIFFUSION3DPA_2; } } } __syncthreads(); - GPU_FOREACH_THREAD(dz, z, DPA_D1D) { - GPU_FOREACH_THREAD(dy, y, DPA_D1D) { - GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(dz, z, diff::D1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, diff::D1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, diff::Q1D) { DIFFUSION3DPA_3; } } } __syncthreads(); - GPU_FOREACH_THREAD(dz, z, DPA_D1D) { - GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { - GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(dz, 
z, diff::D1D) { + GPU_FOREACH_THREAD_DIRECT(qy, y, diff::Q1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, diff::Q1D) { DIFFUSION3DPA_4; } } } __syncthreads(); - GPU_FOREACH_THREAD(qz, z, DPA_Q1D) { - GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { - GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(qz, z, diff::Q1D) { + GPU_FOREACH_THREAD_DIRECT(qy, y, diff::Q1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, diff::Q1D) { DIFFUSION3DPA_5; } } } __syncthreads(); if (threadIdx.z == 0) { - GPU_FOREACH_THREAD(d, y, DPA_D1D) { - GPU_FOREACH_THREAD(q, x, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, diff::D1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, diff::Q1D) { DIFFUSION3DPA_6; } } } __syncthreads(); - GPU_FOREACH_THREAD(qz, z, DPA_Q1D) { - GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { - GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(qz, z, diff::Q1D) { + GPU_FOREACH_THREAD_DIRECT(qy, y, diff::Q1D) { + GPU_FOREACH_THREAD_DIRECT(dx, x, diff::D1D) { DIFFUSION3DPA_7; } } } __syncthreads(); - GPU_FOREACH_THREAD(qz, z, DPA_Q1D) { - GPU_FOREACH_THREAD(dy, y, DPA_D1D) { - GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(qz, z, diff::Q1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, diff::D1D) { + GPU_FOREACH_THREAD_DIRECT(dx, x, diff::D1D) { DIFFUSION3DPA_8; } } } __syncthreads(); - GPU_FOREACH_THREAD(dz, z, DPA_D1D) { - GPU_FOREACH_THREAD(dy, y, DPA_D1D) { - GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(dz, z, diff::D1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, diff::D1D) { + GPU_FOREACH_THREAD_DIRECT(dx, x, diff::D1D) { DIFFUSION3DPA_9; } } } + } template < size_t block_size > @@ -123,7 +124,7 @@ void DIFFUSION3DPA::runCudaVariantImpl(VariantID vid) { // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { - dim3 nthreads_per_block(DPA_Q1D, DPA_Q1D, DPA_Q1D); + dim3 nthreads_per_block(diff::Q1D, diff::Q1D, diff::Q1D); constexpr size_t shmem = 0; RPlaunchCudaKernel( 
(Diffusion3DPA), @@ -141,27 +142,28 @@ void DIFFUSION3DPA::runCudaVariantImpl(VariantID vid) { constexpr bool async = true; using launch_policy = - RAJA::LaunchPolicy>; + RAJA::LaunchPolicy>; using outer_x = RAJA::LoopPolicy; using inner_x = - RAJA::LoopPolicy>; + RAJA::LoopPolicy>; using inner_y = - RAJA::LoopPolicy>; + RAJA::LoopPolicy>; using inner_z = - RAJA::LoopPolicy>; + RAJA::LoopPolicy>; startTimer(); // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { + //clang-format off RAJA::launch( res, RAJA::LaunchParams(RAJA::Teams(NE), - RAJA::Threads(DPA_Q1D, DPA_Q1D, DPA_Q1D)), + RAJA::Threads(diff::Q1D, diff::Q1D, diff::Q1D)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { RAJA::loop(ctx, RAJA::RangeSegment(0, NE), @@ -169,11 +171,11 @@ void DIFFUSION3DPA::runCudaVariantImpl(VariantID vid) { DIFFUSION3DPA_0_GPU; - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dx) { DIFFUSION3DPA_1; @@ -189,9 +191,9 @@ void DIFFUSION3DPA::runCudaVariantImpl(VariantID vid) { RAJA::loop(ctx, RAJA::RangeSegment(0, 1), [&](Index_type RAJA_UNUSED_ARG(dz)) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qx) { DIFFUSION3DPA_2; @@ -205,11 +207,11 @@ void DIFFUSION3DPA::runCudaVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + 
RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qx) { DIFFUSION3DPA_3; @@ -223,11 +225,11 @@ void DIFFUSION3DPA::runCudaVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qx) { DIFFUSION3DPA_4; @@ -241,14 +243,14 @@ void DIFFUSION3DPA::runCudaVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qx) { - DIFFUSION3DPA_5; + DIFFUSION3DPA_5; } // lambda (qx) ); // RAJA::loop @@ -261,10 +263,10 @@ void DIFFUSION3DPA::runCudaVariantImpl(VariantID vid) { RAJA::loop(ctx, RAJA::RangeSegment(0, 1), [&](Index_type RAJA_UNUSED_ARG(dz)) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](Index_type q) { + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), + [&](Index_type qx) { DIFFUSION3DPA_6; @@ -277,11 +279,11 @@ void DIFFUSION3DPA::runCudaVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, 
RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dx) { DIFFUSION3DPA_7; @@ -295,11 +297,11 @@ void DIFFUSION3DPA::runCudaVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dx) { DIFFUSION3DPA_8; @@ -313,11 +315,11 @@ void DIFFUSION3DPA::runCudaVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dx) { DIFFUSION3DPA_9; @@ -334,6 +336,7 @@ void DIFFUSION3DPA::runCudaVariantImpl(VariantID vid) { } // outer lambda (ctx) ); // RAJA::launch + //clang-format on } // loop over kernel reps stopTimer(); diff --git a/src/apps/DIFFUSION3DPA-Hip.cpp b/src/apps/DIFFUSION3DPA-Hip.cpp index 5c0e73eab..185c31002 100644 --- a/src/apps/DIFFUSION3DPA-Hip.cpp +++ b/src/apps/DIFFUSION3DPA-Hip.cpp @@ -28,81 +28,82 @@ __global__ void Diffusion3DPA(const Real_ptr Basis, const Real_ptr dBasis, const Real_ptr D, const Real_ptr X, Real_ptr Y, bool symmetric) { - const Index_type e = hipBlockIdx_x; + const Index_type e = blockIdx.x; DIFFUSION3DPA_0_GPU; - GPU_FOREACH_THREAD(dz, z, DPA_D1D) { - GPU_FOREACH_THREAD(dy, y, DPA_D1D) { - GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(dz, z, diff::D1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, diff::D1D) { + 
GPU_FOREACH_THREAD_DIRECT(dx, x, diff::D1D) { DIFFUSION3DPA_1; } } } if (threadIdx.z == 0) { - GPU_FOREACH_THREAD(dy, y, DPA_D1D) { - GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, diff::D1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, diff::Q1D) { DIFFUSION3DPA_2; } } } __syncthreads(); - GPU_FOREACH_THREAD(dz, z, DPA_D1D) { - GPU_FOREACH_THREAD(dy, y, DPA_D1D) { - GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(dz, z, diff::D1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, diff::D1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, diff::Q1D) { DIFFUSION3DPA_3; } } } __syncthreads(); - GPU_FOREACH_THREAD(dz, z, DPA_D1D) { - GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { - GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(dz, z, diff::D1D) { + GPU_FOREACH_THREAD_DIRECT(qy, y, diff::Q1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, diff::Q1D) { DIFFUSION3DPA_4; } } } __syncthreads(); - GPU_FOREACH_THREAD(qz, z, DPA_Q1D) { - GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { - GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(qz, z, diff::Q1D) { + GPU_FOREACH_THREAD_DIRECT(qy, y, diff::Q1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, diff::Q1D) { DIFFUSION3DPA_5; } } } __syncthreads(); if (threadIdx.z == 0) { - GPU_FOREACH_THREAD(d, y, DPA_D1D) { - GPU_FOREACH_THREAD(q, x, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, diff::D1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, diff::Q1D) { DIFFUSION3DPA_6; } } } __syncthreads(); - GPU_FOREACH_THREAD(qz, z, DPA_Q1D) { - GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { - GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(qz, z, diff::Q1D) { + GPU_FOREACH_THREAD_DIRECT(qy, y, diff::Q1D) { + GPU_FOREACH_THREAD_DIRECT(dx, x, diff::D1D) { DIFFUSION3DPA_7; } } } __syncthreads(); - GPU_FOREACH_THREAD(qz, z, DPA_Q1D) { - GPU_FOREACH_THREAD(dy, y, DPA_D1D) { - GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(qz, z, diff::Q1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, diff::D1D) { + GPU_FOREACH_THREAD_DIRECT(dx, x, 
diff::D1D) { DIFFUSION3DPA_8; } } } __syncthreads(); - GPU_FOREACH_THREAD(dz, z, DPA_D1D) { - GPU_FOREACH_THREAD(dy, y, DPA_D1D) { - GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(dz, z, diff::D1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, diff::D1D) { + GPU_FOREACH_THREAD_DIRECT(dx, x, diff::D1D) { DIFFUSION3DPA_9; } } } + } template < size_t block_size > @@ -123,7 +124,7 @@ void DIFFUSION3DPA::runHipVariantImpl(VariantID vid) { // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { - dim3 nthreads_per_block(DPA_Q1D, DPA_Q1D, DPA_Q1D); + dim3 nthreads_per_block(diff::Q1D, diff::Q1D, diff::Q1D); constexpr size_t shmem = 0; RPlaunchHipKernel( (Diffusion3DPA), @@ -141,27 +142,28 @@ void DIFFUSION3DPA::runHipVariantImpl(VariantID vid) { constexpr bool async = true; using launch_policy = - RAJA::LaunchPolicy>; + RAJA::LaunchPolicy>; using outer_x = RAJA::LoopPolicy; using inner_x = - RAJA::LoopPolicy>; + RAJA::LoopPolicy>; using inner_y = - RAJA::LoopPolicy>; + RAJA::LoopPolicy>; using inner_z = - RAJA::LoopPolicy>; + RAJA::LoopPolicy>; startTimer(); // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { + //clang-format off RAJA::launch( res, RAJA::LaunchParams(RAJA::Teams(NE), - RAJA::Threads(DPA_Q1D, DPA_Q1D, DPA_Q1D)), + RAJA::Threads(diff::Q1D, diff::Q1D, diff::Q1D)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { RAJA::loop(ctx, RAJA::RangeSegment(0, NE), @@ -169,11 +171,11 @@ void DIFFUSION3DPA::runHipVariantImpl(VariantID vid) { DIFFUSION3DPA_0_GPU; - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, 
diff::D1D), [&](Index_type dx) { DIFFUSION3DPA_1; @@ -189,9 +191,9 @@ void DIFFUSION3DPA::runHipVariantImpl(VariantID vid) { RAJA::loop(ctx, RAJA::RangeSegment(0, 1), [&](Index_type RAJA_UNUSED_ARG(dz)) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qx) { DIFFUSION3DPA_2; @@ -205,11 +207,11 @@ void DIFFUSION3DPA::runHipVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qx) { DIFFUSION3DPA_3; @@ -223,11 +225,11 @@ void DIFFUSION3DPA::runHipVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qx) { DIFFUSION3DPA_4; @@ -241,14 +243,14 @@ void DIFFUSION3DPA::runHipVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qx) { - DIFFUSION3DPA_5; + DIFFUSION3DPA_5; } // lambda (qx) ); // RAJA::loop @@ -261,10 +263,10 @@ void 
DIFFUSION3DPA::runHipVariantImpl(VariantID vid) { RAJA::loop(ctx, RAJA::RangeSegment(0, 1), [&](Index_type RAJA_UNUSED_ARG(dz)) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](Index_type q) { + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), + [&](Index_type qx) { DIFFUSION3DPA_6; @@ -277,11 +279,11 @@ void DIFFUSION3DPA::runHipVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dx) { DIFFUSION3DPA_7; @@ -295,11 +297,11 @@ void DIFFUSION3DPA::runHipVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dx) { DIFFUSION3DPA_8; @@ -313,11 +315,11 @@ void DIFFUSION3DPA::runHipVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dx) { DIFFUSION3DPA_9; @@ -334,6 +336,7 @@ void DIFFUSION3DPA::runHipVariantImpl(VariantID vid) { } // outer lambda (ctx) ); // RAJA::launch + 
//clang-format on } // loop over kernel reps stopTimer(); diff --git a/src/apps/DIFFUSION3DPA-OMP.cpp b/src/apps/DIFFUSION3DPA-OMP.cpp index ebbef55ef..80a803898 100644 --- a/src/apps/DIFFUSION3DPA-OMP.cpp +++ b/src/apps/DIFFUSION3DPA-OMP.cpp @@ -38,70 +38,71 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { DIFFUSION3DPA_0_CPU; - CPU_FOREACH(dz, z, DPA_D1D) { - CPU_FOREACH(dy, y, DPA_D1D) { - CPU_FOREACH(dx, x, DPA_D1D) { - DIFFUSION3DPA_1; + CPU_FOREACH(dz,z,diff::D1D) { + CPU_FOREACH(dy,y,diff::D1D) { + CPU_FOREACH(dx,x,diff::D1D) { + DIFFUSION3DPA_1 } } } - CPU_FOREACH(dy, y, DPA_D1D) { - CPU_FOREACH(qx, x, DPA_Q1D) { - DIFFUSION3DPA_2; + CPU_FOREACH(dy,y,diff::D1D) { + CPU_FOREACH(qx,x,diff::Q1D) { + DIFFUSION3DPA_2 } } - CPU_FOREACH(dz, z, DPA_D1D) { - CPU_FOREACH(dy, y, DPA_D1D) { - CPU_FOREACH(qx, x, DPA_Q1D) { - DIFFUSION3DPA_3; + CPU_FOREACH(dz,z,diff::D1D) { + CPU_FOREACH(dy,y,diff::D1D) { + CPU_FOREACH(qx,x,diff::Q1D) { + + DIFFUSION3DPA_3 } } } - CPU_FOREACH(dz, z, DPA_D1D) { - CPU_FOREACH(qy, y, DPA_Q1D) { - CPU_FOREACH(qx, x, DPA_Q1D) { - DIFFUSION3DPA_4; + CPU_FOREACH(dz,z,diff::D1D) { + CPU_FOREACH(qy,y,diff::Q1D) { + CPU_FOREACH(qx,x,diff::Q1D) { + DIFFUSION3DPA_4 } } } - CPU_FOREACH(qz, z, DPA_Q1D) { - CPU_FOREACH(qy, y, DPA_Q1D) { - CPU_FOREACH(qx, x, DPA_Q1D) { - DIFFUSION3DPA_5; + CPU_FOREACH(qz,z,diff::Q1D) { + CPU_FOREACH(qy,y,diff::Q1D) { + CPU_FOREACH(qx,x,diff::Q1D) { + DIFFUSION3DPA_5 } } } - CPU_FOREACH(d, y, DPA_D1D) { - CPU_FOREACH(q, x, DPA_Q1D) { - DIFFUSION3DPA_6; + CPU_FOREACH(dy,y,diff::D1D) { + CPU_FOREACH(qx,x,diff::Q1D) { + DIFFUSION3DPA_6 } } - CPU_FOREACH(qz, z, DPA_Q1D) { - CPU_FOREACH(qy, y, DPA_Q1D) { - CPU_FOREACH(dx, x, DPA_D1D) { - DIFFUSION3DPA_7; + CPU_FOREACH(qz,z,diff::Q1D) { + CPU_FOREACH(qy,y,diff::Q1D) { + CPU_FOREACH(dx,x,diff::D1D) { + DIFFUSION3DPA_7 } } } - CPU_FOREACH(qz, z, DPA_Q1D) { - CPU_FOREACH(dy, y, DPA_D1D) { - CPU_FOREACH(dx, x, DPA_D1D) { - DIFFUSION3DPA_8; + CPU_FOREACH(qz,z,diff::Q1D) { + 
CPU_FOREACH(dy,y,diff::D1D) { + CPU_FOREACH(dx,x,diff::D1D) { + DIFFUSION3DPA_8 } } } - CPU_FOREACH(dz, z, DPA_D1D) { - CPU_FOREACH(dy, y, DPA_D1D) { - CPU_FOREACH(dx, x, DPA_D1D) { - DIFFUSION3DPA_9; + CPU_FOREACH(dz,z,diff::D1D) { + CPU_FOREACH(dy,y,diff::D1D) { + CPU_FOREACH(dx,x,diff::D1D) { + DIFFUSION3DPA_9 } } } @@ -133,6 +134,7 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { // Grid is empty as the host does not need a compute grid to be specified + //clang-format off RAJA::launch( res, RAJA::LaunchParams(), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { @@ -142,11 +144,11 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { DIFFUSION3DPA_0_CPU; - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dx) { DIFFUSION3DPA_1; @@ -162,9 +164,9 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { RAJA::loop(ctx, RAJA::RangeSegment(0, 1), [&](Index_type RAJA_UNUSED_ARG(dz)) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qx) { DIFFUSION3DPA_2; @@ -178,11 +180,11 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), 
[&](Index_type qx) { DIFFUSION3DPA_3; @@ -196,11 +198,11 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qx) { DIFFUSION3DPA_4; @@ -214,14 +216,14 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qx) { - DIFFUSION3DPA_5; + DIFFUSION3DPA_5; } // lambda (qx) ); // RAJA::loop @@ -234,12 +236,12 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { RAJA::loop(ctx, RAJA::RangeSegment(0, 1), [&](Index_type RAJA_UNUSED_ARG(dz)) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](Index_type q) { + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), + [&](Index_type qx) { - DIFFUSION3DPA_6; + DIFFUSION3DPA_6; } // lambda (q) ); // RAJA::loop @@ -250,14 +252,14 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, 
diff::D1D), [&](Index_type dx) { - DIFFUSION3DPA_7; + DIFFUSION3DPA_7; } // lambda (dx) ); // RAJA::loop @@ -268,11 +270,11 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dx) { DIFFUSION3DPA_8; @@ -286,11 +288,11 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dx) { DIFFUSION3DPA_9; @@ -307,6 +309,8 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { } // outer lambda (ctx) ); // RAJA::launch + //clang-format on + } // loop over kernel reps stopTimer(); diff --git a/src/apps/DIFFUSION3DPA-Seq.cpp b/src/apps/DIFFUSION3DPA-Seq.cpp index 8ebf3e0b0..b551f255e 100644 --- a/src/apps/DIFFUSION3DPA-Seq.cpp +++ b/src/apps/DIFFUSION3DPA-Seq.cpp @@ -35,70 +35,71 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { DIFFUSION3DPA_0_CPU; - CPU_FOREACH(dz, z, DPA_D1D) { - CPU_FOREACH(dy, y, DPA_D1D) { - CPU_FOREACH(dx, x, DPA_D1D) { - DIFFUSION3DPA_1; + CPU_FOREACH(dz,z,diff::D1D) { + CPU_FOREACH(dy,y,diff::D1D) { + CPU_FOREACH(dx,x,diff::D1D) { + DIFFUSION3DPA_1 } } } - CPU_FOREACH(dy, y, DPA_D1D) { - CPU_FOREACH(qx, x, DPA_Q1D) { - DIFFUSION3DPA_2; + CPU_FOREACH(dy,y,diff::D1D) { + CPU_FOREACH(qx,x,diff::Q1D) { + DIFFUSION3DPA_2 } } - CPU_FOREACH(dz, z, DPA_D1D) { - CPU_FOREACH(dy, y, DPA_D1D) { - 
CPU_FOREACH(qx, x, DPA_Q1D) { - DIFFUSION3DPA_3; + CPU_FOREACH(dz,z,diff::D1D) { + CPU_FOREACH(dy,y,diff::D1D) { + CPU_FOREACH(qx,x,diff::Q1D) { + + DIFFUSION3DPA_3 } } } - CPU_FOREACH(dz, z, DPA_D1D) { - CPU_FOREACH(qy, y, DPA_Q1D) { - CPU_FOREACH(qx, x, DPA_Q1D) { - DIFFUSION3DPA_4; + CPU_FOREACH(dz,z,diff::D1D) { + CPU_FOREACH(qy,y,diff::Q1D) { + CPU_FOREACH(qx,x,diff::Q1D) { + DIFFUSION3DPA_4 } } } - CPU_FOREACH(qz, z, DPA_Q1D) { - CPU_FOREACH(qy, y, DPA_Q1D) { - CPU_FOREACH(qx, x, DPA_Q1D) { - DIFFUSION3DPA_5; + CPU_FOREACH(qz,z,diff::Q1D) { + CPU_FOREACH(qy,y,diff::Q1D) { + CPU_FOREACH(qx,x,diff::Q1D) { + DIFFUSION3DPA_5 } } } - CPU_FOREACH(d, y, DPA_D1D) { - CPU_FOREACH(q, x, DPA_Q1D) { - DIFFUSION3DPA_6; + CPU_FOREACH(dy,y,diff::D1D) { + CPU_FOREACH(qx,x,diff::Q1D) { + DIFFUSION3DPA_6 } } - CPU_FOREACH(qz, z, DPA_Q1D) { - CPU_FOREACH(qy, y, DPA_Q1D) { - CPU_FOREACH(dx, x, DPA_D1D) { - DIFFUSION3DPA_7; + CPU_FOREACH(qz,z,diff::Q1D) { + CPU_FOREACH(qy,y,diff::Q1D) { + CPU_FOREACH(dx,x,diff::D1D) { + DIFFUSION3DPA_7 } } } - CPU_FOREACH(qz, z, DPA_Q1D) { - CPU_FOREACH(dy, y, DPA_D1D) { - CPU_FOREACH(dx, x, DPA_D1D) { - DIFFUSION3DPA_8; + CPU_FOREACH(qz,z,diff::Q1D) { + CPU_FOREACH(dy,y,diff::D1D) { + CPU_FOREACH(dx,x,diff::D1D) { + DIFFUSION3DPA_8 } } } - CPU_FOREACH(dz, z, DPA_D1D) { - CPU_FOREACH(dy, y, DPA_D1D) { - CPU_FOREACH(dx, x, DPA_D1D) { - DIFFUSION3DPA_9; + CPU_FOREACH(dz,z,diff::D1D) { + CPU_FOREACH(dy,y,diff::D1D) { + CPU_FOREACH(dx,x,diff::D1D) { + DIFFUSION3DPA_9 } } } @@ -131,6 +132,7 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { // Grid is empty as the host does not need a compute grid to be specified + //clang-format off RAJA::launch( res, RAJA::LaunchParams(), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { @@ -140,11 +142,11 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { DIFFUSION3DPA_0_CPU; - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, 
RAJA::RangeSegment(0, diff::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dx) { DIFFUSION3DPA_1; @@ -160,9 +162,9 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { RAJA::loop(ctx, RAJA::RangeSegment(0, 1), [&](Index_type RAJA_UNUSED_ARG(dz)) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qx) { DIFFUSION3DPA_2; @@ -176,11 +178,11 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qx) { DIFFUSION3DPA_3; @@ -194,11 +196,11 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qx) { DIFFUSION3DPA_4; @@ -212,11 +214,11 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), 
[&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qx) { DIFFUSION3DPA_5; @@ -232,10 +234,10 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { RAJA::loop(ctx, RAJA::RangeSegment(0, 1), [&](Index_type RAJA_UNUSED_ARG(dz)) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](Index_type q) { + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), + [&](Index_type qx) { DIFFUSION3DPA_6; @@ -248,11 +250,11 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dx) { DIFFUSION3DPA_7; @@ -266,11 +268,11 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dx) { DIFFUSION3DPA_8; @@ -284,11 +286,11 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, 
diff::D1D), [&](Index_type dx) { DIFFUSION3DPA_9; @@ -305,6 +307,7 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { } // outer lambda (ctx) ); // RAJA::launch + //clang-format on } // loop over kernel reps stopTimer(); diff --git a/src/apps/DIFFUSION3DPA-Sycl.cpp b/src/apps/DIFFUSION3DPA-Sycl.cpp index 770191d51..afe90ea37 100644 --- a/src/apps/DIFFUSION3DPA-Sycl.cpp +++ b/src/apps/DIFFUSION3DPA-Sycl.cpp @@ -37,8 +37,8 @@ void DIFFUSION3DPA::runSyclVariantImpl(VariantID vid) { case Base_SYCL: { - const ::sycl::range<3> workGroupSize(DPA_Q1D, DPA_Q1D, DPA_Q1D); - const ::sycl::range<3> gridSize(DPA_Q1D,DPA_Q1D,DPA_Q1D*NE); + const ::sycl::range<3> workGroupSize(diff::Q1D, diff::Q1D, diff::Q1D); + const ::sycl::range<3> gridSize(diff::Q1D,diff::Q1D,diff::Q1D*NE); startTimer(); // Loop counter increment uses macro to quiet C++20 compiler warning @@ -46,8 +46,8 @@ void DIFFUSION3DPA::runSyclVariantImpl(VariantID vid) { qu->submit([&](::sycl::handler& h) { - constexpr Index_type MQ1 = DPA_Q1D; - constexpr Index_type MD1 = DPA_D1D; + constexpr Index_type MQ1 = diff::Q1D; + constexpr Index_type MD1 = diff::D1D; constexpr Index_type MDQ = (MQ1 > MD1) ? 
MQ1 : MD1; auto sBG_vec = ::sycl::local_accessor(::sycl::range<1>(MQ1*MD1), h); @@ -95,75 +95,72 @@ void DIFFUSION3DPA::runSyclVariantImpl(VariantID vid) { Real_type (*QDD1)[MD1][MD1] = (Real_type (*)[MD1][MD1]) (sm0_1); Real_type (*QDD2)[MD1][MD1] = (Real_type (*)[MD1][MD1]) (sm0_2); - SYCL_FOREACH_THREAD(dz, 0, DPA_D1D) { - SYCL_FOREACH_THREAD(dy, 1, DPA_D1D) { - SYCL_FOREACH_THREAD(dx, 2, DPA_D1D) { + SYCL_FOREACH_THREAD(dz, 0, diff::D1D) { + SYCL_FOREACH_THREAD(dy, 1, diff::D1D) { + SYCL_FOREACH_THREAD(dx, 2, diff::D1D) { DIFFUSION3DPA_1; } } } - - if (itm.get_local_id(0) == 0) - { - SYCL_FOREACH_THREAD(dy, 1, DPA_D1D) { - SYCL_FOREACH_THREAD(qx, 2, DPA_Q1D) { + if (itm.get_local_id(0) == 0) { + SYCL_FOREACH_THREAD(dy, 1, diff::D1D) { + SYCL_FOREACH_THREAD(qx, 2, diff::Q1D) { DIFFUSION3DPA_2; } } } itm.barrier(::sycl::access::fence_space::local_space); - SYCL_FOREACH_THREAD(dz, 0, DPA_D1D) { - SYCL_FOREACH_THREAD(dy, 1, DPA_D1D) { - SYCL_FOREACH_THREAD(qx, 2, DPA_Q1D) { + SYCL_FOREACH_THREAD(dz, 0, diff::D1D) { + SYCL_FOREACH_THREAD(dy, 1, diff::D1D) { + SYCL_FOREACH_THREAD(qx, 2, diff::Q1D) { DIFFUSION3DPA_3; } } } itm.barrier(::sycl::access::fence_space::local_space); - SYCL_FOREACH_THREAD(dz, 0, DPA_D1D) { - SYCL_FOREACH_THREAD(qy, 1, DPA_Q1D) { - SYCL_FOREACH_THREAD(qx, 2, DPA_Q1D) { + SYCL_FOREACH_THREAD(dz, 0, diff::D1D) { + SYCL_FOREACH_THREAD(qy, 1, diff::Q1D) { + SYCL_FOREACH_THREAD(qx, 2, diff::Q1D) { DIFFUSION3DPA_4; } } } itm.barrier(::sycl::access::fence_space::local_space); - SYCL_FOREACH_THREAD(qz, 0, DPA_Q1D) { - SYCL_FOREACH_THREAD(qy, 1, DPA_Q1D) { - SYCL_FOREACH_THREAD(qx, 2, DPA_Q1D) { + SYCL_FOREACH_THREAD(qz, 0, diff::Q1D) { + SYCL_FOREACH_THREAD(qy, 1, diff::Q1D) { + SYCL_FOREACH_THREAD(qx, 2, diff::Q1D) { DIFFUSION3DPA_5; } } } itm.barrier(::sycl::access::fence_space::local_space); - if (itm.get_local_id(0) == 0) - { - SYCL_FOREACH_THREAD(d, 1, DPA_D1D) { - SYCL_FOREACH_THREAD(q, 2, DPA_Q1D) { + if (itm.get_local_id(0) == 0) { + 
SYCL_FOREACH_THREAD(dy, 1, diff::D1D) { + SYCL_FOREACH_THREAD(qx, 2, diff::Q1D) { DIFFUSION3DPA_6; } } } itm.barrier(::sycl::access::fence_space::local_space); - SYCL_FOREACH_THREAD(qz, 0, DPA_Q1D) { - SYCL_FOREACH_THREAD(qy, 1, DPA_Q1D) { - SYCL_FOREACH_THREAD(dx, 2, DPA_D1D) { + SYCL_FOREACH_THREAD(qz, 0, diff::Q1D) { + SYCL_FOREACH_THREAD(qy, 1, diff::Q1D) { + SYCL_FOREACH_THREAD(dx, 2, diff::D1D) { DIFFUSION3DPA_7; } } } itm.barrier(::sycl::access::fence_space::local_space); - SYCL_FOREACH_THREAD(qz, 0, DPA_Q1D) { - SYCL_FOREACH_THREAD(dy, 1, DPA_D1D) { - SYCL_FOREACH_THREAD(dx, 2, DPA_D1D) { + SYCL_FOREACH_THREAD(qz, 0, diff::Q1D) { + SYCL_FOREACH_THREAD(dy, 1, diff::D1D) { + SYCL_FOREACH_THREAD(dx, 2, diff::D1D) { DIFFUSION3DPA_8; } } } itm.barrier(::sycl::access::fence_space::local_space); - SYCL_FOREACH_THREAD(dz, 0, DPA_D1D) { - SYCL_FOREACH_THREAD(dy, 1, DPA_D1D) { - SYCL_FOREACH_THREAD(dx, 2, DPA_D1D) { + SYCL_FOREACH_THREAD(dz, 0, diff::D1D) { + SYCL_FOREACH_THREAD(dy, 1, diff::D1D) { + SYCL_FOREACH_THREAD(dx, 2, diff::D1D) { DIFFUSION3DPA_9; } } @@ -200,8 +197,8 @@ void DIFFUSION3DPA::runSyclVariantImpl(VariantID vid) { size_t shmem = 0; { - constexpr Index_type MQ1 = DPA_Q1D; - constexpr Index_type MD1 = DPA_D1D; + constexpr Index_type MQ1 = diff::Q1D; + constexpr Index_type MD1 = diff::D1D; constexpr Index_type MDQ = (MQ1 > MD1) ? 
MQ1 : MD1; const size_t local_mats = 6; @@ -212,9 +209,10 @@ void DIFFUSION3DPA::runSyclVariantImpl(VariantID vid) { // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { + //clang-format off RAJA::launch( res, RAJA::LaunchParams(RAJA::Teams(NE), - RAJA::Threads(DPA_Q1D, DPA_Q1D, DPA_Q1D), shmem), + RAJA::Threads(diff::Q1D, diff::Q1D, diff::Q1D), shmem), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { const bool symmetric = true; @@ -223,8 +221,8 @@ void DIFFUSION3DPA::runSyclVariantImpl(VariantID vid) { [&](Index_type e) { //Redefine inside the lambda to keep consistent with base version - constexpr Index_type MQ1 = DPA_Q1D; - constexpr Index_type MD1 = DPA_D1D; + constexpr Index_type MQ1 = diff::Q1D; + constexpr Index_type MD1 = diff::D1D; constexpr Index_type MDQ = (MQ1 > MD1) ? MQ1 : MD1; Real_ptr sBG = ctx.getSharedMemory(MQ1*MD1); @@ -256,11 +254,11 @@ void DIFFUSION3DPA::runSyclVariantImpl(VariantID vid) { Real_type (*QDD1)[MD1][MD1] = (Real_type (*)[MD1][MD1]) (sm0_1); Real_type (*QDD2)[MD1][MD1] = (Real_type (*)[MD1][MD1]) (sm0_2); - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dx) { DIFFUSION3DPA_1; @@ -272,12 +270,13 @@ void DIFFUSION3DPA::runSyclVariantImpl(VariantID vid) { } // lambda (dz) ); //RAJA::loop + ctx.teamSync(); RAJA::loop(ctx, RAJA::RangeSegment(0, 1), [&](Index_type RAJA_UNUSED_ARG(dz)) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qx) { DIFFUSION3DPA_2; 
@@ -291,11 +290,11 @@ void DIFFUSION3DPA::runSyclVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qx) { DIFFUSION3DPA_3; @@ -309,11 +308,11 @@ void DIFFUSION3DPA::runSyclVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qx) { DIFFUSION3DPA_4; @@ -327,14 +326,14 @@ void DIFFUSION3DPA::runSyclVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qx) { - DIFFUSION3DPA_5; + DIFFUSION3DPA_5; } // lambda (qx) ); // RAJA::loop @@ -347,10 +346,10 @@ void DIFFUSION3DPA::runSyclVariantImpl(VariantID vid) { RAJA::loop(ctx, RAJA::RangeSegment(0, 1), [&](Index_type RAJA_UNUSED_ARG(dz)) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](Index_type q) { + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), + [&](Index_type qx) { DIFFUSION3DPA_6; @@ -363,11 +362,11 @@ void 
DIFFUSION3DPA::runSyclVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dx) { DIFFUSION3DPA_7; @@ -381,11 +380,11 @@ void DIFFUSION3DPA::runSyclVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::Q1D), [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dx) { DIFFUSION3DPA_8; @@ -399,11 +398,11 @@ void DIFFUSION3DPA::runSyclVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, diff::D1D), [&](Index_type dx) { DIFFUSION3DPA_9; @@ -420,7 +419,7 @@ void DIFFUSION3DPA::runSyclVariantImpl(VariantID vid) { } // outer lambda (ctx) ); // RAJA::launch - + //clang-format on } // loop over kernel reps stopTimer(); diff --git a/src/apps/DIFFUSION3DPA.cpp b/src/apps/DIFFUSION3DPA.cpp index 118245fa2..347949aff 100644 --- a/src/apps/DIFFUSION3DPA.cpp +++ b/src/apps/DIFFUSION3DPA.cpp @@ -25,33 +25,29 @@ DIFFUSION3DPA::DIFFUSION3DPA(const RunParams& params) { m_NE_default = 15625; - setDefaultProblemSize(m_NE_default*DPA_Q1D*DPA_Q1D*DPA_Q1D); + setDefaultProblemSize(m_NE_default*diff::D1D*diff::D1D*diff::D1D); 
setDefaultReps(50); - m_NE = std::max((getTargetProblemSize() + (DPA_Q1D*DPA_Q1D*DPA_Q1D)/2) / (DPA_Q1D*DPA_Q1D*DPA_Q1D), Index_type(1)); + m_NE = std::max((getTargetProblemSize() + (diff::D1D*diff::D1D*diff::D1D)/2) / (diff::D1D*diff::D1D*diff::D1D), Index_type(1)); - setActualProblemSize( m_NE*DPA_Q1D*DPA_Q1D*DPA_Q1D ); + setActualProblemSize( m_NE*diff::D1D*diff::D1D*diff::D1D ); - setItsPerRep( m_NE*DPA_Q1D*DPA_Q1D*DPA_Q1D ); + setItsPerRep( m_NE*diff::D1D*diff::D1D*diff::D1D ); setKernelsPerRep(1); - setBytesReadPerRep( 2*sizeof(Real_type) * DPA_Q1D*DPA_D1D + // b, g - 1*sizeof(Real_type) * DPA_D1D*DPA_D1D*DPA_D1D*m_NE + // x - DPA_SYM*sizeof(Real_type) * DPA_Q1D*DPA_Q1D*DPA_Q1D*m_NE ); // d + setBytesReadPerRep( 2*sizeof(Real_type) * diff::Q1D*diff::D1D + // b, g + 1*sizeof(Real_type) * diff::D1D*diff::D1D*diff::D1D*m_NE + // x + diff::DPA_SYM*sizeof(Real_type) * diff::Q1D*diff::Q1D*diff::Q1D*m_NE ); // d setBytesWrittenPerRep( 0 ); - setBytesModifyWrittenPerRep( 1*sizeof(Real_type) * DPA_D1D*DPA_D1D*DPA_D1D*m_NE ); // y + setBytesModifyWrittenPerRep( 1*sizeof(Real_type) * diff::D1D*diff::D1D*diff::D1D*m_NE ); // y setBytesAtomicModifyWrittenPerRep( 0 ); - setFLOPsPerRep(m_NE * (DPA_Q1D * DPA_D1D + - 5 * DPA_D1D * DPA_D1D * DPA_Q1D * DPA_D1D + - 7 * DPA_D1D * DPA_D1D * DPA_Q1D * DPA_Q1D + - 7 * DPA_Q1D * DPA_D1D * DPA_Q1D * DPA_Q1D + - 15 * DPA_Q1D * DPA_Q1D * DPA_Q1D + - DPA_Q1D * DPA_D1D + - 7 * DPA_Q1D * DPA_Q1D * DPA_D1D * DPA_Q1D + - 7 * DPA_Q1D * DPA_Q1D * DPA_D1D * DPA_D1D + - 7 * DPA_D1D * DPA_Q1D * DPA_D1D * DPA_D1D + - 3 * DPA_D1D * DPA_D1D * DPA_D1D)); + setFLOPsPerRep(m_NE * (4 * diff::D1D * diff::D1D * diff::D1D + //DIFFUSION3DPA_3 + 6 * diff::D1D * diff::Q1D * diff::Q1D + //DIFFUSION3DPA_4 + (6 * diff::D1D + 15) * diff::Q1D * diff::Q1D * diff::Q1D + //DIFFUSION3DPA_5 + (6 * diff::Q1D) * diff::D1D * diff::Q1D * diff::Q1D + //DIFFUSION3DPA_7 + (6 * diff::Q1D) * diff::D1D * diff::D1D * diff::Q1D + //DIFFUSION3DPA_8 + (6 * diff::Q1D + 
1)*diff::D1D*diff::D1D*diff::D1D)); //DIFFUSION3DPA_9 setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); setChecksumTolerance(ChecksumTolerance::normal); @@ -70,16 +66,16 @@ DIFFUSION3DPA::~DIFFUSION3DPA() void DIFFUSION3DPA::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { - allocAndInitDataConst(m_B, Index_type(DPA_Q1D*DPA_D1D), Real_type(1.0), vid); - allocAndInitDataConst(m_G, Index_type(DPA_Q1D*DPA_D1D), Real_type(1.0), vid); - allocAndInitDataConst(m_D, Index_type(DPA_Q1D*DPA_Q1D*DPA_Q1D*DPA_SYM*m_NE), Real_type(1.0), vid); - allocAndInitDataConst(m_X, Index_type(DPA_D1D*DPA_D1D*DPA_D1D*m_NE), Real_type(1.0), vid); - allocAndInitDataConst(m_Y, Index_type(DPA_D1D*DPA_D1D*DPA_D1D*m_NE), Real_type(0.0), vid); + allocAndInitDataConst(m_B, Index_type(diff::Q1D*diff::D1D), Real_type(1.0), vid); + allocAndInitDataConst(m_G, Index_type(diff::Q1D*diff::D1D), Real_type(1.0), vid); + allocAndInitDataConst(m_D, Index_type(diff::Q1D*diff::Q1D*diff::Q1D*diff::DPA_SYM*m_NE), Real_type(1.0), vid); + allocAndInitDataConst(m_X, Index_type(diff::D1D*diff::D1D*diff::D1D*m_NE), Real_type(1.0), vid); + allocAndInitDataConst(m_Y, Index_type(diff::D1D*diff::D1D*diff::D1D*m_NE), Real_type(0.0), vid); } void DIFFUSION3DPA::updateChecksum(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { - addToChecksum(m_Y, DPA_D1D*DPA_D1D*DPA_D1D*m_NE, vid); + addToChecksum(m_Y, diff::D1D*diff::D1D*diff::D1D*m_NE, vid); } void DIFFUSION3DPA::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/apps/DIFFUSION3DPA.hpp b/src/apps/DIFFUSION3DPA.hpp index ecb96696a..5a9e6be95 100644 --- a/src/apps/DIFFUSION3DPA.hpp +++ b/src/apps/DIFFUSION3DPA.hpp @@ -6,455 +6,410 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +//clang-format off /// /// Action of 3D diffusion matrix via partial assembly /// /// Based on MFEM's/CEED algorithms. 
-/// Reference implementation -/// https://github.com/mfem/mfem/blob/master/fem/bilininteg_diffusion_pa.cpp +/// Reference implementation - MFEM-v4.9 +/// https://github.com/mfem/mfem/blob/v4.9/fem/integ/bilininteg_diffusion_kernels.hpp /// -/// for (int e = 0; e < NE; ++e) { +/// for (Index_type e = 0; e < NE; ++e) { /// -/// constexpr int MQ1 = DPA_Q1D; -/// constexpr int MD1 = DPA_D1D; -/// constexpr int MDQ = (MQ1 > ? MQ1 : MD1; -/// double sBG[MQ1*MD1]; -/// double (*B)[MD1] = (double (*)[MD1]) sBG; -/// double (*G)[MD1] = (double (*)[MD1]) sBG; -/// double (*Bt)[MQ1] = (double (*)[MQ1]) sBG; -/// double (*Gt)[MQ1] = (double (*)[MQ1]) sBG; -/// double sm0[3][MDQ*MDQ*MDQ]; -/// double sm1[3][MDQ*MDQ*MDQ]; -/// double (*X)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2); -/// double (*DDQ0)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+0); -/// double (*DDQ1)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+1); -/// double (*DQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+0); -/// double (*DQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+1); -/// double (*DQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+2); -/// double (*QQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+0); -/// double (*QQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+1); -/// double (*QQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+2); -/// double (*QQD0)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+0); -/// double (*QQD1)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+1); -/// double (*QQD2)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+2); -/// double (*QDD0)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+0); -/// double (*QDD1)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+1); -/// double (*QDD2)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2); +/// constexpr Index_type MQ1 = diff::Q1D; +/// constexpr Index_type MD1 = diff::D1D; +/// constexpr Index_type MDQ = (MQ1 > ? 
MQ1 : MD1; +/// Real_type sBG[MQ1*MD1]; +/// Real_type (*B)[MD1] = (Real_type (*)[MD1]) sBG; +/// Real_type (*G)[MD1] = (Real_type (*)[MD1]) sBG; +/// Real_type (*Bt)[MQ1] = (Real_type (*)[MQ1]) sBG; +/// Real_type (*Gt)[MQ1] = (Real_type (*)[MQ1]) sBG; +/// Real_type sm0[3][MDQ*MDQ*MDQ]; +/// Real_type sm1[3][MDQ*MDQ*MDQ]; +/// Real_type (*X)[MD1][MD1] = (Real_type (*)[MD1][MD1]) (sm0+2); +/// Real_type (*DDQ0)[MD1][MQ1] = (Real_type (*)[MD1][MQ1]) (sm0+0); +/// Real_type (*DDQ1)[MD1][MQ1] = (Real_type (*)[MD1][MQ1]) (sm0+1); +/// Real_type (*DQQ0)[MQ1][MQ1] = (Real_type (*)[MQ1][MQ1]) (sm1+0); +/// Real_type (*DQQ1)[MQ1][MQ1] = (Real_type (*)[MQ1][MQ1]) (sm1+1); +/// Real_type (*DQQ2)[MQ1][MQ1] = (Real_type (*)[MQ1][MQ1]) (sm1+2); +/// Real_type (*QQQ0)[MQ1][MQ1] = (Real_type (*)[MQ1][MQ1]) (sm0+0); +/// Real_type (*QQQ1)[MQ1][MQ1] = (Real_type (*)[MQ1][MQ1]) (sm0+1); +/// Real_type (*QQQ2)[MQ1][MQ1] = (Real_type (*)[MQ1][MQ1]) (sm0+2); +/// Real_type (*QQD0)[MQ1][MD1] = (Real_type (*)[MQ1][MD1]) (sm1+0); +/// Real_type (*QQD1)[MQ1][MD1] = (Real_type (*)[MQ1][MD1]) (sm1+1); +/// Real_type (*QQD2)[MQ1][MD1] = (Real_type (*)[MQ1][MD1]) (sm1+2); +/// Real_type (*QDD0)[MD1][MD1] = (Real_type (*)[MD1][MD1]) (sm0+0); +/// Real_type (*QDD1)[MD1][MD1] = (Real_type (*)[MD1][MD1]) (sm0+1); +/// Real_type (*QDD2)[MD1][MD1] = (Real_type (*)[MD1][MD1]) (sm0+2); /// -/// for(int dz=0;dz MD1) ? 
MQ1 : MD1; \ - RAJA_TEAM_SHARED double sBG[MQ1*MD1]; \ - double (*B)[MD1] = (double (*)[MD1]) sBG; \ - double (*G)[MD1] = (double (*)[MD1]) sBG; \ - double (*Bt)[MQ1] = (double (*)[MQ1]) sBG; \ - double (*Gt)[MQ1] = (double (*)[MQ1]) sBG; \ - RAJA_TEAM_SHARED double sm0[3][MDQ*MDQ*MDQ]; \ - RAJA_TEAM_SHARED double sm1[3][MDQ*MDQ*MDQ]; \ - double (*s_X)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2); \ - double (*DDQ0)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+0); \ - double (*DDQ1)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+1); \ - double (*DQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+0); \ - double (*DQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+1); \ - double (*DQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+2); \ - double (*QQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+0); \ - double (*QQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+1); \ - double (*QQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+2); \ - double (*QQD0)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+0); \ - double (*QQD1)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+1); \ - double (*QQD2)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+2); \ - double (*QDD0)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+0); \ - double (*QDD1)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+1); \ - double (*QDD2)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2); - -#define DIFFUSION3DPA_0_CPU \ - constexpr int MQ1 = DPA_Q1D; \ - constexpr int MD1 = DPA_D1D; \ - constexpr int MDQ = (MQ1 > MD1) ? 
MQ1 : MD1; \ - double sBG[MQ1*MD1]; \ - double (*B)[MD1] = (double (*)[MD1]) sBG; \ - double (*G)[MD1] = (double (*)[MD1]) sBG; \ - double (*Bt)[MQ1] = (double (*)[MQ1]) sBG; \ - double (*Gt)[MQ1] = (double (*)[MQ1]) sBG; \ - double sm0[3][MDQ*MDQ*MDQ]; \ - double sm1[3][MDQ*MDQ*MDQ]; \ - double (*s_X)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2); \ - double (*DDQ0)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+0); \ - double (*DDQ1)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+1); \ - double (*DQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+0); \ - double (*DQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+1); \ - double (*DQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+2); \ - double (*QQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+0); \ - double (*QQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+1); \ - double (*QQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+2); \ - double (*QQD0)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+0); \ - double (*QQD1)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+1); \ - double (*QQD2)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+2); \ - double (*QDD0)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+0); \ - double (*QDD1)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+1); \ - double (*QDD2)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2); - -#define DIFFUSION3DPA_1 \ - s_X[dz][dy][dx] = DPA_X(dx,dy,dz,e); - -#define DIFFUSION3DPA_2 \ - const Index_type i = DPA_qi(qx,dy,DPA_Q1D); \ - const Index_type j = DPA_dj(qx,dy,DPA_D1D); \ - const Index_type k = DPA_qk(qx,dy,DPA_Q1D); \ - const Index_type l = DPA_dl(qx,dy,DPA_D1D); \ - B[i][j] = DPA_b(qx,dy); \ - G[k][l] = DPA_g(qx,dy) * DPA_sign(qx,dy); \ - -#define DIFFUSION3DPA_3 \ - Real_type u = 0.0, v = 0.0; \ - RAJAPERF_UNROLL(MD1) \ - for (Index_type dx = 0; dx < DPA_D1D; ++dx) \ - { \ - const Index_type i = DPA_qi(qx,dx,DPA_Q1D); \ - const Index_type j = DPA_dj(qx,dx,DPA_D1D); \ - const Index_type k = DPA_qk(qx,dx,DPA_Q1D); \ - const Index_type l = DPA_dl(qx,dx,DPA_D1D); \ - const Real_type s = DPA_sign(qx,dx); \ - const Real_type coords = 
s_X[dz][dy][dx]; \ - u += coords * B[i][j]; \ - v += coords * G[k][l] * s; \ - } \ - DDQ0[dz][dy][qx] = u; \ - DDQ1[dz][dy][qx] = v; - -#define DIFFUSION3DPA_4 \ - Real_type u = 0.0, v = 0.0, w = 0.0; \ - RAJAPERF_UNROLL(MD1) \ - for (Index_type dy = 0; dy < DPA_D1D; ++dy) \ - { \ - const Index_type i = DPA_qi(qy,dy,DPA_Q1D); \ - const Index_type j = DPA_dj(qy,dy,DPA_D1D); \ - const Index_type k = DPA_qk(qy,dy,DPA_Q1D); \ - const Index_type l = DPA_dl(qy,dy,DPA_D1D); \ - const Real_type s = DPA_sign(qy,dy); \ - u += DDQ1[dz][dy][qx] * B[i][j]; \ - v += DDQ0[dz][dy][qx] * G[k][l] * s; \ - w += DDQ0[dz][dy][qx] * B[i][j]; \ - } \ - DQQ0[dz][qy][qx] = u; \ - DQQ1[dz][qy][qx] = v; \ - DQQ2[dz][qy][qx] = w; - -#define DIFFUSION3DPA_5 \ - Real_type u = 0.0, v = 0.0, w = 0.0; \ - RAJAPERF_UNROLL(MD1) \ - for (Index_type dz = 0; dz < DPA_D1D; ++dz) \ - { \ - const Index_type i = DPA_qi(qz,dz,DPA_Q1D); \ - const Index_type j = DPA_dj(qz,dz,DPA_D1D); \ - const Index_type k = DPA_qk(qz,dz,DPA_Q1D); \ - const Index_type l = DPA_dl(qz,dz,DPA_D1D); \ - const Real_type s = DPA_sign(qz,dz); \ - u += DQQ0[dz][qy][qx] * B[i][j]; \ - v += DQQ1[dz][qy][qx] * B[i][j]; \ - w += DQQ2[dz][qy][qx] * G[k][l] * s; \ - } \ - const Real_type O11 = DPA_d(qx,qy,qz,0,e); \ - const Real_type O12 = DPA_d(qx,qy,qz,1,e); \ - const Real_type O13 = DPA_d(qx,qy,qz,2,e); \ - const Real_type O21 = symmetric ? O12 : DPA_d(qx,qy,qz,3,e); \ - const Real_type O22 = symmetric ? DPA_d(qx,qy,qz,3,e) : DPA_d(qx,qy,qz,4,e); \ - const Real_type O23 = symmetric ? DPA_d(qx,qy,qz,4,e) : DPA_d(qx,qy,qz,5,e); \ - const Real_type O31 = symmetric ? O13 : DPA_d(qx,qy,qz,6,e); \ - const Real_type O32 = symmetric ? O23 : DPA_d(qx,qy,qz,7,e); \ - const Real_type O33 = symmetric ? 
DPA_d(qx,qy,qz,5,e) : DPA_d(qx,qy,qz,8,e); \ - const Real_type gX = u; \ - const Real_type gY = v; \ - const Real_type gZ = w; \ - QQQ0[qz][qy][qx] = (O11*gX) + (O12*gY) + (O13*gZ); \ - QQQ1[qz][qy][qx] = (O21*gX) + (O22*gY) + (O23*gZ); \ - QQQ2[qz][qy][qx] = (O31*gX) + (O32*gY) + (O33*gZ); - -#define DIFFUSION3DPA_6 \ - const Index_type i = DPA_qi(q,d,DPA_Q1D); \ - const Index_type j = DPA_dj(q,d,DPA_D1D); \ - const Index_type k = DPA_qk(q,d,DPA_Q1D); \ - const Index_type l = DPA_dl(q,d,DPA_D1D); \ - Bt[j][i] = DPA_b(q,d); \ - Gt[l][k] = DPA_g(q,d) * DPA_sign(q,d); - -#define DIFFUSION3DPA_7 \ - Real_type u = 0.0, v = 0.0, w = 0.0; \ - RAJAPERF_UNROLL(MQ1) \ - for (Index_type qx = 0; qx < DPA_Q1D; ++qx) \ - { \ - const Index_type i = DPA_qi(qx,dx,DPA_Q1D); \ - const Index_type j = DPA_dj(qx,dx,DPA_D1D); \ - const Index_type k = DPA_qk(qx,dx,DPA_Q1D); \ - const Index_type l = DPA_dl(qx,dx,DPA_D1D); \ - const Real_type s = DPA_sign(qx,dx); \ - u += QQQ0[qz][qy][qx] * Gt[l][k] * s; \ - v += QQQ1[qz][qy][qx] * Bt[j][i]; \ - w += QQQ2[qz][qy][qx] * Bt[j][i]; \ - } \ - QQD0[qz][qy][dx] = u; \ - QQD1[qz][qy][dx] = v; \ - QQD2[qz][qy][dx] = w; - -#define DIFFUSION3DPA_8 \ - Real_type u = 0.0, v = 0.0, w = 0.0; \ - RAJAPERF_UNROLL(DPA_Q1D) \ - for (Index_type qy = 0; qy < DPA_Q1D; ++qy) \ - { \ - const Index_type i = DPA_qi(qy,dy,DPA_Q1D); \ - const Index_type j = DPA_dj(qy,dy,DPA_D1D); \ - const Index_type k = DPA_qk(qy,dy,DPA_Q1D); \ - const Index_type l = DPA_dl(qy,dy,DPA_D1D); \ - const Real_type s = DPA_sign(qy,dy); \ - u += QQD0[qz][qy][dx] * Bt[j][i]; \ - v += QQD1[qz][qy][dx] * Gt[l][k] * s; \ - w += QQD2[qz][qy][dx] * Bt[j][i]; \ - } \ - QDD0[qz][dy][dx] = u; \ - QDD1[qz][dy][dx] = v; \ - QDD2[qz][dy][dx] = w; - -#define DIFFUSION3DPA_9 \ - Real_type u = 0.0, v = 0.0, w = 0.0; \ - RAJAPERF_UNROLL(MQ1) \ - for (Index_type qz = 0; qz < DPA_Q1D; ++qz) \ - { \ - const Index_type i = DPA_qi(qz,dz,DPA_Q1D); \ - const Index_type j = DPA_dj(qz,dz,DPA_D1D); \ - const 
Index_type k = DPA_qk(qz,dz,DPA_Q1D); \ - const Index_type l = DPA_dl(qz,dz,DPA_D1D); \ - const Real_type s = DPA_sign(qz,dz); \ - u += QDD0[qz][dy][dx] * Bt[j][i]; \ - v += QDD1[qz][dy][dx] * Bt[j][i]; \ - w += QDD2[qz][dy][dx] * Gt[l][k] * s; \ - } \ - DPA_Y(dx,dy,dz,e) += (u + v + w); - -namespace rajaperf -{ +#define DPA_qi(q, d, Q) (((q) <= (d)) ? (q) : (Q) - 1 - (q)) +#define DPA_dj(q, d, D) (((q) <= (d)) ? (d) : (D) - 1 - (d)) +#define DPA_qk(q, d, Q) (((q) <= (d)) ? (Q) - 1 - (q) : (q)) +#define DPA_dl(q, d, D) (((q) <= (d)) ? (D) - 1 - (d) : (d)) +#define DPA_sign(q, d) (((q) <= (d)) ? -1.0 : 1.0) + +#define DIFFUSION3DPA_0_GPU \ + constexpr Index_type MQ1 = diff::Q1D; \ + constexpr Index_type MD1 = diff::D1D; \ + constexpr Index_type MDQ = (MQ1 > MD1) ? MQ1 : MD1; \ + RAJA_TEAM_SHARED Real_type sBG[MQ1 * MD1]; \ + Real_type(*B)[MD1] = (Real_type(*)[MD1])sBG; \ + Real_type(*G)[MD1] = (Real_type(*)[MD1])sBG; \ + Real_type(*Bt)[MQ1] = (Real_type(*)[MQ1])sBG; \ + Real_type(*Gt)[MQ1] = (Real_type(*)[MQ1])sBG; \ + RAJA_TEAM_SHARED Real_type sm0[3][MDQ * MDQ * MDQ]; \ + RAJA_TEAM_SHARED Real_type sm1[3][MDQ * MDQ * MDQ]; \ + Real_type(*s_X)[MD1][MD1] = (Real_type(*)[MD1][MD1])(sm0 + 2); \ + Real_type(*DDQ0)[MD1][MQ1] = (Real_type(*)[MD1][MQ1])(sm0 + 0); \ + Real_type(*DDQ1)[MD1][MQ1] = (Real_type(*)[MD1][MQ1])(sm0 + 1); \ + Real_type(*DQQ0)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm1 + 0); \ + Real_type(*DQQ1)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm1 + 1); \ + Real_type(*DQQ2)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm1 + 2); \ + Real_type(*QQQ0)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm0 + 0); \ + Real_type(*QQQ1)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm0 + 1); \ + Real_type(*QQQ2)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm0 + 2); \ + Real_type(*QQD0)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])(sm1 + 0); \ + Real_type(*QQD1)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])(sm1 + 1); \ + Real_type(*QQD2)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])(sm1 + 2); \ + Real_type(*QDD0)[MD1][MD1] = 
(Real_type(*)[MD1][MD1])(sm0 + 0); \ + Real_type(*QDD1)[MD1][MD1] = (Real_type(*)[MD1][MD1])(sm0 + 1); \ + Real_type(*QDD2)[MD1][MD1] = (Real_type(*)[MD1][MD1])(sm0 + 2); + +#define DIFFUSION3DPA_0_CPU \ + constexpr Index_type MQ1 = diff::Q1D; \ + constexpr Index_type MD1 = diff::D1D; \ + constexpr Index_type MDQ = (MQ1 > MD1) ? MQ1 : MD1; \ + Real_type sBG[MQ1 * MD1]; \ + Real_type(*B)[MD1] = (Real_type(*)[MD1])sBG; \ + Real_type(*G)[MD1] = (Real_type(*)[MD1])sBG; \ + Real_type(*Bt)[MQ1] = (Real_type(*)[MQ1])sBG; \ + Real_type(*Gt)[MQ1] = (Real_type(*)[MQ1])sBG; \ + Real_type sm0[3][MDQ * MDQ * MDQ]; \ + Real_type sm1[3][MDQ * MDQ * MDQ]; \ + Real_type(*s_X)[MD1][MD1] = (Real_type(*)[MD1][MD1])(sm0 + 2); \ + Real_type(*DDQ0)[MD1][MQ1] = (Real_type(*)[MD1][MQ1])(sm0 + 0); \ + Real_type(*DDQ1)[MD1][MQ1] = (Real_type(*)[MD1][MQ1])(sm0 + 1); \ + Real_type(*DQQ0)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm1 + 0); \ + Real_type(*DQQ1)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm1 + 1); \ + Real_type(*DQQ2)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm1 + 2); \ + Real_type(*QQQ0)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm0 + 0); \ + Real_type(*QQQ1)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm0 + 1); \ + Real_type(*QQQ2)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm0 + 2); \ + Real_type(*QQD0)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])(sm1 + 0); \ + Real_type(*QQD1)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])(sm1 + 1); \ + Real_type(*QQD2)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])(sm1 + 2); \ + Real_type(*QDD0)[MD1][MD1] = (Real_type(*)[MD1][MD1])(sm0 + 0); \ + Real_type(*QDD1)[MD1][MD1] = (Real_type(*)[MD1][MD1])(sm0 + 1); \ + Real_type(*QDD2)[MD1][MD1] = (Real_type(*)[MD1][MD1])(sm0 + 2); + +#define DIFFUSION3DPA_1 s_X[dz][dy][dx] = DPA_X(dx, dy, dz, e); + +#define DIFFUSION3DPA_2 \ + B[qx][dy] = DPA_b(qx, dy); \ + G[qx][dy] = DPA_g(qx, dy); + +#define DIFFUSION3DPA_3 \ + Real_type u = 0.0, v = 0.0; \ + RAJAPERF_UNROLL(MD1) \ + for (Index_type dx = 0; dx < diff::D1D; ++dx) { \ + const Real_type coords = 
s_X[dz][dy][dx]; \ + u += coords * B[qx][dx]; \ + v += coords * G[qx][dx]; \ + } \ + DDQ0[dz][dy][qx] = u; \ + DDQ1[dz][dy][qx] = v; + +#define DIFFUSION3DPA_4 \ + Real_type u = 0.0, v = 0.0, w = 0.0; \ + RAJAPERF_UNROLL(MD1) \ + for (Index_type dy = 0; dy < diff::D1D; ++dy) { \ + u += DDQ1[dz][dy][qx] * B[qy][dy]; \ + v += DDQ0[dz][dy][qx] * G[qy][dy]; \ + w += DDQ0[dz][dy][qx] * B[qy][dy]; \ + } \ + DQQ0[dz][qy][qx] = u; \ + DQQ1[dz][qy][qx] = v; \ + DQQ2[dz][qy][qx] = w; + +#define DIFFUSION3DPA_5 \ + Real_type u = 0.0, v = 0.0, w = 0.0; \ + RAJAPERF_UNROLL(MD1) \ + for (Index_type dz = 0; dz < diff::D1D; ++dz) { \ + u += DQQ0[dz][qy][qx] * B[qz][dz]; \ + v += DQQ1[dz][qy][qx] * B[qz][dz]; \ + w += DQQ2[dz][qy][qx] * G[qz][dz]; \ + } \ + const Real_type O11 = DPA_d(qx, qy, qz, 0, e); \ + const Real_type O12 = DPA_d(qx, qy, qz, 1, e); \ + const Real_type O13 = DPA_d(qx, qy, qz, 2, e); \ + const Real_type O21 = symmetric ? O12 : DPA_d(qx, qy, qz, 3, e); \ + const Real_type O22 = \ + symmetric ? DPA_d(qx, qy, qz, 3, e) : DPA_d(qx, qy, qz, 4, e); \ + const Real_type O23 = \ + symmetric ? DPA_d(qx, qy, qz, 4, e) : DPA_d(qx, qy, qz, 5, e); \ + const Real_type O31 = symmetric ? O13 : DPA_d(qx, qy, qz, 6, e); \ + const Real_type O32 = symmetric ? O23 : DPA_d(qx, qy, qz, 7, e); \ + const Real_type O33 = \ + symmetric ? 
DPA_d(qx, qy, qz, 5, e) : DPA_d(qx, qy, qz, 8, e); \ + const Real_type gX = u; \ + const Real_type gY = v; \ + const Real_type gZ = w; \ + QQQ0[qz][qy][qx] = (O11 * gX) + (O12 * gY) + (O13 * gZ); \ + QQQ1[qz][qy][qx] = (O21 * gX) + (O22 * gY) + (O23 * gZ); \ + QQQ2[qz][qy][qx] = (O31 * gX) + (O32 * gY) + (O33 * gZ); + +#define DIFFUSION3DPA_6 \ + Bt[dy][qx] = DPA_b(qx, dy); \ + Gt[dy][qx] = DPA_g(qx, dy); + +#define DIFFUSION3DPA_7 \ + Real_type u = 0.0, v = 0.0, w = 0.0; \ + RAJAPERF_UNROLL(MQ1) \ + for (Index_type qx = 0; qx < diff::Q1D; ++qx) { \ + u += QQQ0[qz][qy][qx] * Gt[dx][qx]; \ + v += QQQ1[qz][qy][qx] * Bt[dx][qx]; \ + w += QQQ2[qz][qy][qx] * Bt[dx][qx]; \ + } \ + QQD0[qz][qy][dx] = u; \ + QQD1[qz][qy][dx] = v; \ + QQD2[qz][qy][dx] = w; + +#define DIFFUSION3DPA_8 \ + Real_type u = 0.0, v = 0.0, w = 0.0; \ + RAJAPERF_UNROLL(diff::Q1D) \ + for (Index_type qy = 0; qy < diff::Q1D; ++qy) { \ + u += QQD0[qz][qy][dx] * Bt[dy][qy]; \ + v += QQD1[qz][qy][dx] * Gt[dy][qy]; \ + w += QQD2[qz][qy][dx] * Bt[dy][qy]; \ + } \ + QDD0[qz][dy][dx] = u; \ + QDD1[qz][dy][dx] = v; \ + QDD2[qz][dy][dx] = w; + +#define DIFFUSION3DPA_9 \ + Real_type u = 0.0, v = 0.0, w = 0.0; \ + RAJAPERF_UNROLL(MQ1) \ + for (Index_type qz = 0; qz < diff::Q1D; ++qz) { \ + u += QDD0[qz][dy][dx] * Bt[dz][qz]; \ + v += QDD1[qz][dy][dx] * Bt[dz][qz]; \ + w += QDD2[qz][dy][dx] * Gt[dz][qz]; \ + } \ + DPA_Y(dx, dy, dz, e) += (u + v + w); + +namespace rajaperf { class RunParams; -namespace apps -{ +namespace apps { -class DIFFUSION3DPA : public KernelBase -{ +class DIFFUSION3DPA : public KernelBase { public: - - DIFFUSION3DPA(const RunParams& params); + DIFFUSION3DPA(const RunParams &params); ~DIFFUSION3DPA(); @@ -471,15 +426,13 @@ class DIFFUSION3DPA : public KernelBase void runSeqVariant(VariantID vid); void runOpenMPVariant(VariantID vid); - template < size_t block_size > - void runCudaVariantImpl(VariantID vid); - template < size_t block_size > - void runHipVariantImpl(VariantID vid); - template <
size_t work_group_size > - void runSyclVariantImpl(VariantID vid); + template void runCudaVariantImpl(VariantID vid); + template void runHipVariantImpl(VariantID vid); + template void runSyclVariantImpl(VariantID vid); private: - static const size_t default_gpu_block_size = DPA_Q1D * DPA_Q1D * DPA_Q1D; + static const size_t default_gpu_block_size = + diff::Q1D * diff::Q1D * diff::Q1D; using gpu_block_sizes_type = integer::list_type; Real_ptr m_B; diff --git a/src/apps/FEM_MACROS.hpp b/src/apps/FEM_MACROS.hpp index 03e98d84d..9e8b5dc73 100644 --- a/src/apps/FEM_MACROS.hpp +++ b/src/apps/FEM_MACROS.hpp @@ -24,11 +24,16 @@ #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) #define GPU_FOREACH_THREAD(i, k, N) \ for (Index_type i = threadIdx.k; i < N; i += blockDim.k) + +#define GPU_FOREACH_THREAD_DIRECT(i, k, N) \ + if (const Index_type i = threadIdx.k; i < N) #endif #if defined(RAJA_ENABLE_SYCL) #define SYCL_FOREACH_THREAD(i, k, N) \ for (Index_type i = itm.get_local_id(k); i < N; i += itm.get_local_range(k)) +#define SYCL_FOREACH_THREAD_DIRECT(i, k, N) \ + if (const Index_type i = itm.get_local_id(k); i < N) #endif #if defined(RAJA_ENABLE_SYCL) @@ -52,44 +57,44 @@ #define CPU_FOREACH(i, k, N) for (Index_type i = 0; i < N; i++) #define SHARED_LOOP_2D(tx, ty, Nx, Ny) \ - for (int ty = 0; ty < Ny; ty++) \ - for (int tx = 0; tx < Nx; tx++) + for (Index_type ty = 0; ty < Ny; ty++) \ + for (Index_type tx = 0; tx < Nx; tx++) #define SHARED_LOOP_3D(tx, ty, tz, Nx, Ny, Nz) \ - for (int tz = 0; tz < Nz; tz++) \ - for (int ty = 0; ty < Ny; ty++) \ - for (int tx = 0; tx < Nx; tx++) + for (Index_type tz = 0; tz < Nz; tz++) \ + for (Index_type ty = 0; ty < Ny; ty++) \ + for (Index_type tx = 0; tx < Nx; tx++) #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) #define GPU_SHARED_DIRECT_2D(tx, ty, Nx, Ny) \ if (threadIdx.z < 1) \ - if (const int ty = threadIdx.y; ty < Ny) \ - if (const int tx = threadIdx.x; tx < Nx) + if (const Index_type ty = threadIdx.y; ty < Ny) \ + 
if (const Index_type tx = threadIdx.x; tx < Nx) #define GPU_SHARED_DIRECT_3D(tx, ty, tz, Nx, Ny, Nz) \ - if (const int tz = threadIdx.z; tz < Nz) \ - if (const int ty = threadIdx.y; ty < Ny) \ - if (const int tx = threadIdx.x; tx < Nx) + if (const Index_type tz = threadIdx.z; tz < Nz) \ + if (const Index_type ty = threadIdx.y; ty < Ny) \ + if (const Index_type tx = threadIdx.x; tx < Nx) #define GPU_SHARED_LOOP_2D(tx, ty, Nx, Ny) \ if (threadIdx.z < 1) \ - for (int ty = threadIdx.y; ty < Ny; ty += blockDim.y) \ - for (int tx = threadIdx.x; tx < Nx; tx += blockDim.x) + for (Index_type ty = threadIdx.y; ty < Ny; ty += blockDim.y) \ + for (Index_type tx = threadIdx.x; tx < Nx; tx += blockDim.x) #define GPU_SHARED_LOOP_3D(tx, ty, tz, Nx, Ny, Nz) \ - for (int tz = threadIdx.z; tz < Nz; tz += blockDim.z) \ - for (int ty = threadIdx.y; ty < Ny; ty += blockDim.y) \ - for (int tx = threadIdx.x; tx < Nx; tx += blockDim.x) + for (Index_type tz = threadIdx.z; tz < Nz; tz += blockDim.z) \ + for (Index_type ty = threadIdx.y; ty < Ny; ty += blockDim.y) \ + for (Index_type tx = threadIdx.x; tx < Nx; tx += blockDim.x) #define GPU_SHARED_LOOP_2D_INC(tx, ty, Nx, Ny, runtime_blocks_size) \ if (threadIdx.z < 1) \ - for (int ty = threadIdx.y; ty < Ny; ty += runtime_blocks_size) \ - for (int tx = threadIdx.x; tx < Nx; tx += runtime_blocks_size) + for (Index_type ty = threadIdx.y; ty < Ny; ty += runtime_blocks_size) \ + for (Index_type tx = threadIdx.x; tx < Nx; tx += runtime_blocks_size) #define GPU_SHARED_LOOP_3D_INC(tx, ty, tz, Nx, Ny, Nz, runtime_blocks_size) \ - for (int tz = threadIdx.z; tz < Nz; tz += runtime_blocks_size) \ - for (int ty = threadIdx.y; ty < Ny; ty += runtime_blocks_size) \ - for (int tx = threadIdx.x; tx < Nx; tx += runtime_blocks_size) + for (Index_type tz = threadIdx.z; tz < Nz; tz += runtime_blocks_size) \ + for (Index_type ty = threadIdx.y; ty < Ny; ty += runtime_blocks_size) \ + for (Index_type tx = threadIdx.x; tx < Nx; tx += runtime_blocks_size) #endif diff 
--git a/src/apps/MASS3DEA-Cuda.cpp b/src/apps/MASS3DEA-Cuda.cpp index 783ad4678..6a9c13e90 100644 --- a/src/apps/MASS3DEA-Cuda.cpp +++ b/src/apps/MASS3DEA-Cuda.cpp @@ -28,8 +28,8 @@ __global__ void Mass3DEA(const Real_ptr B, const Real_ptr D, Real_ptr M) { MASS3DEA_0 GPU_FOREACH_THREAD(iz, z, 1) { - GPU_FOREACH_THREAD(d, x, MEA_D1D) { - GPU_FOREACH_THREAD(q, y, MEA_Q1D) { + GPU_FOREACH_THREAD(d, x, mea::D1D) { + GPU_FOREACH_THREAD(q, y, mea::Q1D) { MASS3DEA_1 } } @@ -37,9 +37,9 @@ __global__ void Mass3DEA(const Real_ptr B, const Real_ptr D, Real_ptr M) { MASS3DEA_2 - GPU_FOREACH_THREAD(k1, x, MEA_Q1D) { - GPU_FOREACH_THREAD(k2, y, MEA_Q1D) { - GPU_FOREACH_THREAD(k3, z, MEA_Q1D) { + GPU_FOREACH_THREAD(k1, x, mea::Q1D) { + GPU_FOREACH_THREAD(k2, y, mea::Q1D) { + GPU_FOREACH_THREAD(k3, z, mea::Q1D) { MASS3DEA_3 } } @@ -47,14 +47,14 @@ __global__ void Mass3DEA(const Real_ptr B, const Real_ptr D, Real_ptr M) { __syncthreads(); - GPU_FOREACH_THREAD(i1, x, MEA_D1D) { - GPU_FOREACH_THREAD(i2, y, MEA_D1D) { - GPU_FOREACH_THREAD(i3, z, MEA_D1D) { + GPU_FOREACH_THREAD(i1, x, mea::D1D) { + GPU_FOREACH_THREAD(i2, y, mea::D1D) { + GPU_FOREACH_THREAD(i3, z, mea::D1D) { MASS3DEA_4 } } } - + } template < size_t block_size > @@ -75,7 +75,7 @@ void MASS3DEA::runCudaVariantImpl(VariantID vid) { // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { - dim3 nthreads_per_block(MEA_D1D, MEA_D1D, MEA_D1D); + dim3 nthreads_per_block(mea::D1D, mea::D1D, mea::D1D); constexpr size_t shmem = 0; RPlaunchCudaKernel( (Mass3DEA), @@ -92,23 +92,24 @@ void MASS3DEA::runCudaVariantImpl(VariantID vid) { constexpr bool async = true; - using launch_policy = RAJA::LaunchPolicy>; + using launch_policy = RAJA::LaunchPolicy>; using outer_x = RAJA::LoopPolicy; - using inner_x = RAJA::LoopPolicy>; + using inner_x = RAJA::LoopPolicy>; - using inner_y = RAJA::LoopPolicy>; + using inner_y = RAJA::LoopPolicy>; - using inner_z = 
RAJA::LoopPolicy>; + using inner_z = RAJA::LoopPolicy>; startTimer(); // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { + //clang-format off RAJA::launch( res, RAJA::LaunchParams(RAJA::Teams(NE), - RAJA::Threads(MEA_D1D, MEA_D1D, MEA_D1D)), + RAJA::Threads(mea::D1D, mea::D1D, mea::D1D)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { RAJA::loop(ctx, RAJA::RangeSegment(0, NE), @@ -118,9 +119,9 @@ void MASS3DEA::runCudaVariantImpl(VariantID vid) { RAJA::loop(ctx, RAJA::RangeSegment(0, 1), [&](Index_type ) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::D1D), [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::Q1D), [&](Index_type q) { MASS3DEA_1 } @@ -133,11 +134,11 @@ void MASS3DEA::runCudaVariantImpl(VariantID vid) { MASS3DEA_2 - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::Q1D), [&](Index_type k1) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::Q1D), [&](Index_type k2) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::Q1D), [&](Index_type k3) { MASS3DEA_3 } @@ -149,11 +150,11 @@ void MASS3DEA::runCudaVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::D1D), [&](Index_type i1) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::D1D), [&](Index_type i2) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::D1D), [&](Index_type i3) { MASS3DEA_4 } @@ -168,6 +169,7 @@ void MASS3DEA::runCudaVariantImpl(VariantID vid) { } // outer lambda (ctx) ); // RAJA::launch + //clang-format on } // loop over kernel reps stopTimer(); diff --git a/src/apps/MASS3DEA-Hip.cpp 
b/src/apps/MASS3DEA-Hip.cpp index 8faef17c1..9ef50c930 100644 --- a/src/apps/MASS3DEA-Hip.cpp +++ b/src/apps/MASS3DEA-Hip.cpp @@ -28,8 +28,8 @@ __global__ void Mass3DEA(const Real_ptr B, const Real_ptr D, Real_ptr M) { MASS3DEA_0 GPU_FOREACH_THREAD(iz, z, 1) { - GPU_FOREACH_THREAD(d, x, MEA_D1D) { - GPU_FOREACH_THREAD(q, y, MEA_Q1D) { + GPU_FOREACH_THREAD(d, x, mea::D1D) { + GPU_FOREACH_THREAD(q, y, mea::Q1D) { MASS3DEA_1 } } @@ -37,9 +37,9 @@ __global__ void Mass3DEA(const Real_ptr B, const Real_ptr D, Real_ptr M) { MASS3DEA_2 - GPU_FOREACH_THREAD(k1, x, MEA_Q1D) { - GPU_FOREACH_THREAD(k2, y, MEA_Q1D) { - GPU_FOREACH_THREAD(k3, z, MEA_Q1D) { + GPU_FOREACH_THREAD(k1, x, mea::Q1D) { + GPU_FOREACH_THREAD(k2, y, mea::Q1D) { + GPU_FOREACH_THREAD(k3, z, mea::Q1D) { MASS3DEA_3 } } @@ -47,14 +47,14 @@ __global__ void Mass3DEA(const Real_ptr B, const Real_ptr D, Real_ptr M) { __syncthreads(); - GPU_FOREACH_THREAD(i1, x, MEA_D1D) { - GPU_FOREACH_THREAD(i2, y, MEA_D1D) { - GPU_FOREACH_THREAD(i3, z, MEA_D1D) { + GPU_FOREACH_THREAD(i1, x, mea::D1D) { + GPU_FOREACH_THREAD(i2, y, mea::D1D) { + GPU_FOREACH_THREAD(i3, z, mea::D1D) { MASS3DEA_4 } } } - + } template < size_t block_size > @@ -75,7 +75,7 @@ void MASS3DEA::runHipVariantImpl(VariantID vid) { // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { - dim3 nthreads_per_block(MEA_D1D, MEA_D1D, MEA_D1D); + dim3 nthreads_per_block(mea::D1D, mea::D1D, mea::D1D); constexpr size_t shmem = 0; RPlaunchHipKernel( (Mass3DEA), @@ -92,23 +92,24 @@ void MASS3DEA::runHipVariantImpl(VariantID vid) { constexpr bool async = true; - using launch_policy = RAJA::LaunchPolicy>; + using launch_policy = RAJA::LaunchPolicy>; using outer_x = RAJA::LoopPolicy; - using inner_x = RAJA::LoopPolicy>; + using inner_x = RAJA::LoopPolicy>; - using inner_y = RAJA::LoopPolicy>; + using inner_y = RAJA::LoopPolicy>; - using inner_z = RAJA::LoopPolicy>; + using inner_z = 
RAJA::LoopPolicy>; startTimer(); // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { + //clang-format off RAJA::launch( res, RAJA::LaunchParams(RAJA::Teams(NE), - RAJA::Threads(MEA_D1D, MEA_D1D, MEA_D1D)), + RAJA::Threads(mea::D1D, mea::D1D, mea::D1D)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { RAJA::loop(ctx, RAJA::RangeSegment(0, NE), @@ -118,9 +119,9 @@ void MASS3DEA::runHipVariantImpl(VariantID vid) { RAJA::loop(ctx, RAJA::RangeSegment(0, 1), [&](Index_type ) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::D1D), [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::Q1D), [&](Index_type q) { MASS3DEA_1 } @@ -133,11 +134,11 @@ void MASS3DEA::runHipVariantImpl(VariantID vid) { MASS3DEA_2 - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::Q1D), [&](Index_type k1) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::Q1D), [&](Index_type k2) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::Q1D), [&](Index_type k3) { MASS3DEA_3 } @@ -149,11 +150,11 @@ void MASS3DEA::runHipVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::D1D), [&](Index_type i1) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::D1D), [&](Index_type i2) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::D1D), [&](Index_type i3) { MASS3DEA_4 } @@ -168,6 +169,7 @@ void MASS3DEA::runHipVariantImpl(VariantID vid) { } // outer lambda (ctx) ); // RAJA::launch + //clang-format on } // loop over kernel reps stopTimer(); diff --git a/src/apps/MASS3DEA-OMP.cpp b/src/apps/MASS3DEA-OMP.cpp index 
26fa85e36..7fb126897 100644 --- a/src/apps/MASS3DEA-OMP.cpp +++ b/src/apps/MASS3DEA-OMP.cpp @@ -39,25 +39,25 @@ void MASS3DEA::runOpenMPVariant(VariantID vid) { MASS3DEA_0_CPU - CPU_FOREACH(d, x, MEA_D1D) { - CPU_FOREACH(q, y, MEA_Q1D) { + CPU_FOREACH(d, x, mea::D1D) { + CPU_FOREACH(q, y, mea::Q1D) { MASS3DEA_1 } } MASS3DEA_2_CPU - CPU_FOREACH(k1, x, MEA_Q1D) { - CPU_FOREACH(k2, y, MEA_Q1D) { - CPU_FOREACH(k3, z, MEA_Q1D) { + CPU_FOREACH(k1, x, mea::Q1D) { + CPU_FOREACH(k2, y, mea::Q1D) { + CPU_FOREACH(k3, z, mea::Q1D) { MASS3DEA_3 } } } - CPU_FOREACH(i1, x, MEA_D1D) { - CPU_FOREACH(i2, y, MEA_D1D) { - CPU_FOREACH(i3, z, MEA_D1D) { + CPU_FOREACH(i1, x, mea::D1D) { + CPU_FOREACH(i2, y, mea::D1D) { + CPU_FOREACH(i3, z, mea::D1D) { MASS3DEA_4 } } @@ -90,6 +90,7 @@ void MASS3DEA::runOpenMPVariant(VariantID vid) { for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { //Grid is empty as the host does not need a compute grid to be specified + //clang-format off RAJA::launch( res, RAJA::LaunchParams(), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { @@ -101,9 +102,9 @@ void MASS3DEA::runOpenMPVariant(VariantID vid) { RAJA::loop(ctx, RAJA::RangeSegment(0, 1), [&](Index_type ) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::D1D), [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::Q1D), [&](Index_type q) { MASS3DEA_1 } @@ -115,11 +116,11 @@ void MASS3DEA::runOpenMPVariant(VariantID vid) { MASS3DEA_2 - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::Q1D), [&](Index_type k1) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::Q1D), [&](Index_type k2) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::Q1D), [&](Index_type k3) { MASS3DEA_3 } @@ -131,11 +132,11 @@ void MASS3DEA::runOpenMPVariant(VariantID vid) { 
ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::D1D), [&](Index_type i1) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::D1D), [&](Index_type i2) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::D1D), [&](Index_type i3) { MASS3DEA_4 } @@ -150,6 +151,7 @@ void MASS3DEA::runOpenMPVariant(VariantID vid) { } // outer lambda (ctx) ); // // RAJA::launch + //clang-format on } // loop over kernel reps stopTimer(); diff --git a/src/apps/MASS3DEA-Seq.cpp b/src/apps/MASS3DEA-Seq.cpp index 3e9bc1c99..17c191a84 100644 --- a/src/apps/MASS3DEA-Seq.cpp +++ b/src/apps/MASS3DEA-Seq.cpp @@ -36,25 +36,25 @@ void MASS3DEA::runSeqVariant(VariantID vid) MASS3DEA_0_CPU - CPU_FOREACH(d, x, MEA_D1D) { - CPU_FOREACH(q, y, MEA_Q1D) { + CPU_FOREACH(d, x, mea::D1D) { + CPU_FOREACH(q, y, mea::Q1D) { MASS3DEA_1 } } MASS3DEA_2_CPU - CPU_FOREACH(k1, x, MEA_Q1D) { - CPU_FOREACH(k2, y, MEA_Q1D) { - CPU_FOREACH(k3, z, MEA_Q1D) { + CPU_FOREACH(k1, x, mea::Q1D) { + CPU_FOREACH(k2, y, mea::Q1D) { + CPU_FOREACH(k3, z, mea::Q1D) { MASS3DEA_3 } } } - CPU_FOREACH(i1, x, MEA_D1D) { - CPU_FOREACH(i2, y, MEA_D1D) { - CPU_FOREACH(i3, z, MEA_D1D) { + CPU_FOREACH(i1, x, mea::D1D) { + CPU_FOREACH(i2, y, mea::D1D) { + CPU_FOREACH(i3, z, mea::D1D) { MASS3DEA_4 } } @@ -87,6 +87,7 @@ void MASS3DEA::runSeqVariant(VariantID vid) // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { + //clang-format off RAJA::launch( res, RAJA::LaunchParams(), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { @@ -98,9 +99,9 @@ void MASS3DEA::runSeqVariant(VariantID vid) RAJA::loop(ctx, RAJA::RangeSegment(0, 1), [&](Index_type ) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::D1D), [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_Q1D), + 
RAJA::loop(ctx, RAJA::RangeSegment(0, mea::Q1D), [&](Index_type q) { MASS3DEA_1 } @@ -112,11 +113,11 @@ void MASS3DEA::runSeqVariant(VariantID vid) MASS3DEA_2 - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::Q1D), [&](Index_type k1) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::Q1D), [&](Index_type k2) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::Q1D), [&](Index_type k3) { MASS3DEA_3 } @@ -128,11 +129,11 @@ void MASS3DEA::runSeqVariant(VariantID vid) ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::D1D), [&](Index_type i1) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::D1D), [&](Index_type i2) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::D1D), [&](Index_type i3) { MASS3DEA_4 } @@ -146,6 +147,7 @@ void MASS3DEA::runSeqVariant(VariantID vid) ); // RAJA::loop } // outer lambda (ctx) ); // RAJA::launch + //clang-format on } // loop over kernel reps stopTimer(); diff --git a/src/apps/MASS3DEA-Sycl.cpp b/src/apps/MASS3DEA-Sycl.cpp index c9b5fbc26..7fb200fd0 100644 --- a/src/apps/MASS3DEA-Sycl.cpp +++ b/src/apps/MASS3DEA-Sycl.cpp @@ -34,8 +34,8 @@ void MASS3DEA::runSyclVariantImpl(VariantID vid) { case Base_SYCL: { - const ::sycl::range<3> workGroupSize(MEA_Q1D, MEA_Q1D, MEA_Q1D); - const ::sycl::range<3> gridSize(MEA_Q1D,MEA_Q1D,MEA_Q1D*NE); + const ::sycl::range<3> workGroupSize(mea::Q1D, mea::Q1D, mea::Q1D); + const ::sycl::range<3> gridSize(mea::Q1D,mea::Q1D,mea::Q1D*NE); startTimer(); // Loop counter increment uses macro to quiet C++20 compiler warning @@ -44,8 +44,8 @@ void MASS3DEA::runSyclVariantImpl(VariantID vid) { constexpr size_t shmem = 0; qu->submit([&](::sycl::handler& h) { - ::sycl::local_accessor s_B(::sycl::range<2>(MEA_Q1D,MEA_D1D),h); - 
::sycl::local_accessor s_D(::sycl::range<3>(MEA_Q1D,MEA_Q1D,MEA_Q1D),h); + ::sycl::local_accessor s_B(::sycl::range<2>(mea::Q1D,mea::D1D),h); + ::sycl::local_accessor s_D(::sycl::range<3>(mea::Q1D,mea::Q1D,mea::Q1D),h); h.parallel_for (::sycl::nd_range<3>(gridSize, workGroupSize), @@ -54,8 +54,8 @@ void MASS3DEA::runSyclVariantImpl(VariantID vid) { const Index_type e = itm.get_group(2); SYCL_FOREACH_THREAD(iz, 0, 1) { - SYCL_FOREACH_THREAD(d, 2, MEA_D1D) { - SYCL_FOREACH_THREAD(q, 1, MEA_Q1D) { + SYCL_FOREACH_THREAD(d, 2, mea::D1D) { + SYCL_FOREACH_THREAD(q, 1, mea::Q1D) { MASS3DEA_1 } } @@ -64,9 +64,9 @@ void MASS3DEA::runSyclVariantImpl(VariantID vid) { //not needed as we dynamicaly allocate shared memory in sycl //MASS3DEA_2 - SYCL_FOREACH_THREAD(k1, 2, MEA_Q1D) { - SYCL_FOREACH_THREAD(k2, 1, MEA_Q1D) { - SYCL_FOREACH_THREAD(k3, 0, MEA_Q1D) { + SYCL_FOREACH_THREAD(k1, 2, mea::Q1D) { + SYCL_FOREACH_THREAD(k2, 1, mea::Q1D) { + SYCL_FOREACH_THREAD(k3, 0, mea::Q1D) { MASS3DEA_3 } } @@ -74,9 +74,9 @@ void MASS3DEA::runSyclVariantImpl(VariantID vid) { itm.barrier(::sycl::access::fence_space::local_space); - SYCL_FOREACH_THREAD(i1, 2, MEA_D1D) { - SYCL_FOREACH_THREAD(i2, 1, MEA_D1D) { - SYCL_FOREACH_THREAD(i3, 0, MEA_D1D) { + SYCL_FOREACH_THREAD(i1, 2, mea::D1D) { + SYCL_FOREACH_THREAD(i2, 1, mea::D1D) { + SYCL_FOREACH_THREAD(i3, 0, mea::D1D) { MASS3DEA_4 } } @@ -105,31 +105,32 @@ void MASS3DEA::runSyclVariantImpl(VariantID vid) { using inner_z = RAJA::LoopPolicy; - constexpr size_t shmem = (MEA_Q1D*MEA_D1D + MEA_Q1D*MEA_Q1D*MEA_Q1D)*sizeof(Real_type); + constexpr size_t shmem = (mea::Q1D*mea::D1D + mea::Q1D*mea::Q1D*mea::Q1D)*sizeof(Real_type); startTimer(); // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { + //clang-format off RAJA::launch( res, RAJA::LaunchParams(RAJA::Teams(NE), - RAJA::Threads(MEA_D1D, MEA_D1D, MEA_D1D), shmem), + RAJA::Threads(mea::D1D, mea::D1D, mea::D1D), 
shmem), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { RAJA::loop(ctx, RAJA::RangeSegment(0, NE), [&](Index_type e) { - Real_ptr s_B_ptr = ctx.getSharedMemory(MEA_Q1D*MEA_D1D); - Real_ptr s_D_ptr = ctx.getSharedMemory(MEA_Q1D*MEA_Q1D*MEA_Q1D); + Real_ptr s_B_ptr = ctx.getSharedMemory(mea::Q1D*mea::D1D); + Real_ptr s_D_ptr = ctx.getSharedMemory(mea::Q1D*mea::Q1D*mea::Q1D); - Real_type (*s_B)[MEA_D1D] = (Real_type (*)[MEA_D1D]) s_B_ptr; - Real_type (*s_D)[MEA_Q1D][MEA_Q1D] = (Real_type (*)[MEA_Q1D][MEA_Q1D]) s_B_ptr; + Real_type (*s_B)[mea::D1D] = (Real_type (*)[mea::D1D]) s_B_ptr; + Real_type (*s_D)[mea::Q1D][mea::Q1D] = (Real_type (*)[mea::Q1D][mea::Q1D]) s_B_ptr; RAJA::loop(ctx, RAJA::RangeSegment(0, 1), [&](Index_type ) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::D1D), [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::Q1D), [&](Index_type q) { MASS3DEA_1 } @@ -142,11 +143,11 @@ void MASS3DEA::runSyclVariantImpl(VariantID vid) { //not needed as we dynamicaly allocate shared memory in sycl //MASS3DEA_2 - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::Q1D), [&](Index_type k1) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::Q1D), [&](Index_type k2) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::Q1D), [&](Index_type k3) { MASS3DEA_3 } @@ -158,11 +159,11 @@ void MASS3DEA::runSyclVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::D1D), [&](Index_type i1) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::D1D), [&](Index_type i2) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mea::D1D), [&](Index_type i3) { MASS3DEA_4 } @@ -177,6 
+178,7 @@ void MASS3DEA::runSyclVariantImpl(VariantID vid) { } // outer lambda (ctx) ); // RAJA::launch + //clang-format on } // loop over kernel reps stopTimer(); diff --git a/src/apps/MASS3DEA.cpp b/src/apps/MASS3DEA.cpp index ba2e0832d..65b59b53d 100644 --- a/src/apps/MASS3DEA.cpp +++ b/src/apps/MASS3DEA.cpp @@ -25,20 +25,20 @@ MASS3DEA::MASS3DEA(const RunParams& params) { m_NE_default = 8000; - setDefaultProblemSize(m_NE_default*MEA_Q1D*MEA_Q1D*MEA_Q1D); + setDefaultProblemSize(m_NE_default*mea::D1D*mea::D1D*mea::D1D); setDefaultReps(1); - const Index_type ea_mat_entries = MEA_D1D*MEA_D1D*MEA_D1D*MEA_D1D*MEA_D1D*MEA_D1D; + const Index_type ea_mat_entries = mea::D1D*mea::D1D*mea::D1D*mea::D1D*mea::D1D*mea::D1D; m_NE = std::max((getTargetProblemSize() + (ea_mat_entries)/2) / (ea_mat_entries), Index_type(1)); setActualProblemSize( m_NE*ea_mat_entries ); - setItsPerRep( m_NE*MEA_Q1D*MEA_Q1D*MEA_Q1D ); + setItsPerRep( m_NE*mea::D1D*mea::D1D*mea::D1D ); setKernelsPerRep(1); - setBytesReadPerRep( 1*sizeof(Real_type) * MEA_Q1D*MEA_D1D + // B - 1*sizeof(Real_type) * MEA_Q1D*MEA_Q1D*MEA_Q1D*m_NE ); // D + setBytesReadPerRep( 1*sizeof(Real_type) * mea::Q1D*mea::D1D + // B + 1*sizeof(Real_type) * mea::Q1D*mea::Q1D*mea::Q1D*m_NE ); // D setBytesWrittenPerRep( 1*sizeof(Real_type) * ea_mat_entries*m_NE ); // M_e setBytesModifyWrittenPerRep( 0 ); setBytesAtomicModifyWrittenPerRep( 0 ); @@ -62,16 +62,15 @@ MASS3DEA::~MASS3DEA() void MASS3DEA::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { - allocAndInitDataConst(m_B, Index_type(MEA_Q1D*MEA_D1D), Real_type(1.0), vid); - allocAndInitDataConst(m_D, Index_type(MEA_Q1D*MEA_Q1D*MEA_Q1D*m_NE), Real_type(1.0), vid); - allocAndInitDataConst(m_M, Index_type(MEA_D1D*MEA_D1D*MEA_D1D* - MEA_D1D*MEA_D1D*MEA_D1D*m_NE), Real_type(0.0), vid); + allocAndInitDataConst(m_B, Index_type(mea::Q1D*mea::D1D), Real_type(1.0), vid); + allocAndInitDataConst(m_D, Index_type(mea::Q1D*mea::Q1D*mea::Q1D*m_NE), Real_type(1.0), vid); + 
allocAndInitDataConst(m_M, Index_type(mea::D1D*mea::D1D*mea::D1D* + mea::D1D*mea::D1D*mea::D1D*m_NE), Real_type(0.0), vid); } void MASS3DEA::updateChecksum(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { - addToChecksum(m_M, MEA_D1D*MEA_D1D*MEA_D1D* - MEA_D1D*MEA_D1D*MEA_D1D*m_NE, vid); + addToChecksum(m_M, mea::D1D*mea::D1D*mea::D1D*mea::D1D*mea::D1D*mea::D1D*m_NE, vid); } void MASS3DEA::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/apps/MASS3DEA.hpp b/src/apps/MASS3DEA.hpp index 0e0b44b7f..868891966 100644 --- a/src/apps/MASS3DEA.hpp +++ b/src/apps/MASS3DEA.hpp @@ -10,47 +10,48 @@ /// Assembly of 3D mass matrix /// /// Based on MFEM's/CEED algorithms. -/// Reference implementation -/// https://github.com/mfem/mfem/blob/master/fem/integ/bilininteg_mass_ea.cpp#L142 +/// Reference implementation - MFEM-v4.9 +/// https://github.com/mfem/mfem/blob/v4.9/fem/integ/bilininteg_mass_kernels.hpp#L1268 +/// Kernel uses shared memory which is optimal for orders higher than 2 /// -/// for (int e = 0; e < NE; ++e) +/// for (Index_type e = 0; e < NE; ++e) /// { /// -/// double s_B[MQ1s][MD1s]; -/// double r_B[MQ1r][MD1r]; +/// Real_type s_B[MQ1s][MD1s]; +/// Real_type r_B[MQ1r][MD1r]; /// -/// double (*l_B)[MD1] = nullptr; +/// Real_type (*l_B)[MD1] = nullptr; /// -/// for(int d=0; d - void runCudaVariantImpl(VariantID vid); - template - void runHipVariantImpl(VariantID vid); - template - void runSyclVariantImpl(VariantID vid); + template void runCudaVariantImpl(VariantID vid); + template void runHipVariantImpl(VariantID vid); + template void runSyclVariantImpl(VariantID vid); private: - static const size_t default_gpu_block_size = MEA_D1D * MEA_D1D * MEA_D1D; - using gpu_block_sizes_type = - integer::list_type; + static const size_t default_gpu_block_size = mea::D1D * mea::D1D * mea::D1D; + using gpu_block_sizes_type = integer::list_type; Real_ptr m_B; Real_ptr m_Bt; diff --git a/src/apps/MASS3DPA-Cuda.cpp b/src/apps/MASS3DPA-Cuda.cpp 
index b5c696746..b13395c0f 100644 --- a/src/apps/MASS3DPA-Cuda.cpp +++ b/src/apps/MASS3DPA-Cuda.cpp @@ -31,57 +31,57 @@ __global__ void Mass3DPA(const Real_ptr B, const Real_ptr Bt, MASS3DPA_0_GPU - GPU_FOREACH_THREAD(dy, y, MPA_D1D) { - GPU_FOREACH_THREAD(dx, x, MPA_D1D){ + GPU_FOREACH_THREAD(dy, y, mpa::D1D) { + GPU_FOREACH_THREAD(dx, x, mpa::D1D){ MASS3DPA_1 } - GPU_FOREACH_THREAD(dx, x, MPA_Q1D) { + GPU_FOREACH_THREAD(dx, x, mpa::Q1D) { MASS3DPA_2 } } __syncthreads(); - GPU_FOREACH_THREAD(dy, y, MPA_D1D) { - GPU_FOREACH_THREAD(qx, x, MPA_Q1D) { + GPU_FOREACH_THREAD(dy, y, mpa::D1D) { + GPU_FOREACH_THREAD(qx, x, mpa::Q1D) { MASS3DPA_3 } } __syncthreads(); - GPU_FOREACH_THREAD(qy, y, MPA_Q1D) { - GPU_FOREACH_THREAD(qx, x, MPA_Q1D) { + GPU_FOREACH_THREAD(qy, y, mpa::Q1D) { + GPU_FOREACH_THREAD(qx, x, mpa::Q1D) { MASS3DPA_4 } } __syncthreads(); - GPU_FOREACH_THREAD(qy, y, MPA_Q1D) { - GPU_FOREACH_THREAD(qx, x, MPA_Q1D) { + GPU_FOREACH_THREAD(qy, y, mpa::Q1D) { + GPU_FOREACH_THREAD(qx, x, mpa::Q1D) { MASS3DPA_5 } } __syncthreads(); - GPU_FOREACH_THREAD(d, y, MPA_D1D) { - GPU_FOREACH_THREAD(q, x, MPA_Q1D) { + GPU_FOREACH_THREAD(d, y, mpa::D1D) { + GPU_FOREACH_THREAD(q, x, mpa::Q1D) { MASS3DPA_6 } } __syncthreads(); - GPU_FOREACH_THREAD(qy, y, MPA_Q1D) { - GPU_FOREACH_THREAD(dx, x, MPA_D1D) { + GPU_FOREACH_THREAD(qy, y, mpa::Q1D) { + GPU_FOREACH_THREAD(dx, x, mpa::D1D) { MASS3DPA_7 } } __syncthreads(); - GPU_FOREACH_THREAD(dy, y, MPA_D1D) { - GPU_FOREACH_THREAD(dx, x, MPA_D1D) { + GPU_FOREACH_THREAD(dy, y, mpa::D1D) { + GPU_FOREACH_THREAD(dx, x, mpa::D1D) { MASS3DPA_8 } } __syncthreads(); - GPU_FOREACH_THREAD(dy, y, MPA_D1D) { - GPU_FOREACH_THREAD(dx, x, MPA_D1D) { + GPU_FOREACH_THREAD(dy, y, mpa::D1D) { + GPU_FOREACH_THREAD(dx, x, mpa::D1D) { MASS3DPA_9 } } @@ -105,7 +105,7 @@ void MASS3DPA::runCudaVariantImpl(VariantID vid) { // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { - 
dim3 nthreads_per_block(MPA_Q1D, MPA_Q1D, 1); + dim3 nthreads_per_block(mpa::Q1D, mpa::Q1D, 1); constexpr size_t shmem = 0; RPlaunchCudaKernel( (Mass3DPA), @@ -122,21 +122,22 @@ void MASS3DPA::runCudaVariantImpl(VariantID vid) { constexpr bool async = true; - using launch_policy = RAJA::LaunchPolicy>; + using launch_policy = RAJA::LaunchPolicy>; using outer_x = RAJA::LoopPolicy; - using inner_x = RAJA::LoopPolicy>; + using inner_x = RAJA::LoopPolicy>; - using inner_y = RAJA::LoopPolicy>; + using inner_y = RAJA::LoopPolicy>; startTimer(); // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { + //clang-format off RAJA::launch( res, RAJA::LaunchParams(RAJA::Teams(NE), - RAJA::Threads(MPA_Q1D, MPA_Q1D, 1)), + RAJA::Threads(mpa::Q1D, mpa::Q1D, 1)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { RAJA::loop(ctx, RAJA::RangeSegment(0, NE), @@ -144,15 +145,15 @@ void MASS3DPA::runCudaVariantImpl(VariantID vid) { MASS3DPA_0_GPU - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dx) { MASS3DPA_1 } ); // RAJA::loop - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type dx) { MASS3DPA_2 } @@ -162,9 +163,9 @@ void MASS3DPA::runCudaVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type qx) { MASS3DPA_3 } @@ -174,9 +175,9 @@ void MASS3DPA::runCudaVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, 
RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type qx) { MASS3DPA_4 } @@ -186,9 +187,9 @@ void MASS3DPA::runCudaVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type qx) { MASS3DPA_5 } @@ -198,9 +199,9 @@ void MASS3DPA::runCudaVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type q) { MASS3DPA_6 } @@ -210,9 +211,9 @@ void MASS3DPA::runCudaVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dx) { MASS3DPA_7 } @@ -222,9 +223,9 @@ void MASS3DPA::runCudaVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dx) { MASS3DPA_8 } @@ -234,9 +235,9 @@ void MASS3DPA::runCudaVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dx) { MASS3DPA_9 } @@ -249,6 +250,7 @@ void MASS3DPA::runCudaVariantImpl(VariantID vid) { } // outer lambda (ctx) ); // RAJA::launch + //clang-format on } // loop over kernel reps 
stopTimer(); diff --git a/src/apps/MASS3DPA-Hip.cpp b/src/apps/MASS3DPA-Hip.cpp index d2ab21cc1..ca8e89da1 100644 --- a/src/apps/MASS3DPA-Hip.cpp +++ b/src/apps/MASS3DPA-Hip.cpp @@ -31,57 +31,57 @@ __global__ void Mass3DPA(const Real_ptr B, const Real_ptr Bt, MASS3DPA_0_GPU - GPU_FOREACH_THREAD(dy, y, MPA_D1D) { - GPU_FOREACH_THREAD(dx, x, MPA_D1D){ + GPU_FOREACH_THREAD(dy, y, mpa::D1D) { + GPU_FOREACH_THREAD(dx, x, mpa::D1D){ MASS3DPA_1 } - GPU_FOREACH_THREAD(dx, x, MPA_Q1D) { + GPU_FOREACH_THREAD(dx, x, mpa::Q1D) { MASS3DPA_2 } } __syncthreads(); - GPU_FOREACH_THREAD(dy, y, MPA_D1D) { - GPU_FOREACH_THREAD(qx, x, MPA_Q1D) { + GPU_FOREACH_THREAD(dy, y, mpa::D1D) { + GPU_FOREACH_THREAD(qx, x, mpa::Q1D) { MASS3DPA_3 } } __syncthreads(); - GPU_FOREACH_THREAD(qy, y, MPA_Q1D) { - GPU_FOREACH_THREAD(qx, x, MPA_Q1D) { + GPU_FOREACH_THREAD(qy, y, mpa::Q1D) { + GPU_FOREACH_THREAD(qx, x, mpa::Q1D) { MASS3DPA_4 } } __syncthreads(); - GPU_FOREACH_THREAD(qy, y, MPA_Q1D) { - GPU_FOREACH_THREAD(qx, x, MPA_Q1D) { + GPU_FOREACH_THREAD(qy, y, mpa::Q1D) { + GPU_FOREACH_THREAD(qx, x, mpa::Q1D) { MASS3DPA_5 } } __syncthreads(); - GPU_FOREACH_THREAD(d, y, MPA_D1D) { - GPU_FOREACH_THREAD(q, x, MPA_Q1D) { + GPU_FOREACH_THREAD(d, y, mpa::D1D) { + GPU_FOREACH_THREAD(q, x, mpa::Q1D) { MASS3DPA_6 } } __syncthreads(); - GPU_FOREACH_THREAD(qy, y, MPA_Q1D) { - GPU_FOREACH_THREAD(dx, x, MPA_D1D) { + GPU_FOREACH_THREAD(qy, y, mpa::Q1D) { + GPU_FOREACH_THREAD(dx, x, mpa::D1D) { MASS3DPA_7 } } __syncthreads(); - GPU_FOREACH_THREAD(dy, y, MPA_D1D) { - GPU_FOREACH_THREAD(dx, x, MPA_D1D) { + GPU_FOREACH_THREAD(dy, y, mpa::D1D) { + GPU_FOREACH_THREAD(dx, x, mpa::D1D) { MASS3DPA_8 } } __syncthreads(); - GPU_FOREACH_THREAD(dy, y, MPA_D1D) { - GPU_FOREACH_THREAD(dx, x, MPA_D1D) { + GPU_FOREACH_THREAD(dy, y, mpa::D1D) { + GPU_FOREACH_THREAD(dx, x, mpa::D1D) { MASS3DPA_9 } } @@ -105,7 +105,7 @@ void MASS3DPA::runHipVariantImpl(VariantID vid) { // Loop counter increment uses macro to quiet C++20 compiler 
warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { - dim3 nthreads_per_block(MPA_Q1D, MPA_Q1D, 1); + dim3 nthreads_per_block(mpa::Q1D, mpa::Q1D, 1); constexpr size_t shmem = 0; RPlaunchHipKernel( (Mass3DPA), @@ -123,36 +123,37 @@ void MASS3DPA::runHipVariantImpl(VariantID vid) { constexpr bool async = true; - using launch_policy = RAJA::LaunchPolicy>; + using launch_policy = RAJA::LaunchPolicy>; using outer_x = RAJA::LoopPolicy; - using inner_x = RAJA::LoopPolicy>; + using inner_x = RAJA::LoopPolicy>; - using inner_y = RAJA::LoopPolicy>; + using inner_y = RAJA::LoopPolicy>; startTimer(); // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { + //clang-format off RAJA::launch( res, RAJA::LaunchParams(RAJA::Teams(NE), - RAJA::Threads(MPA_Q1D, MPA_Q1D, 1)), + RAJA::Threads(mpa::Q1D, mpa::Q1D, 1)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { RAJA::loop(ctx, RAJA::RangeSegment(0, NE), [&](Index_type e) { MASS3DPA_0_GPU - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dx) { MASS3DPA_1 } ); // RAJA::loop - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type dx) { MASS3DPA_2 } @@ -162,9 +163,9 @@ void MASS3DPA::runHipVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type qx) { MASS3DPA_3 } @@ -174,9 +175,9 @@ void MASS3DPA::runHipVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type 
qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type qx) { MASS3DPA_4 } @@ -186,9 +187,9 @@ void MASS3DPA::runHipVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type qx) { MASS3DPA_5 } @@ -198,9 +199,9 @@ void MASS3DPA::runHipVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type q) { MASS3DPA_6 } @@ -210,9 +211,9 @@ void MASS3DPA::runHipVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dx) { MASS3DPA_7 } @@ -222,9 +223,9 @@ void MASS3DPA::runHipVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dx) { MASS3DPA_8 } @@ -234,9 +235,9 @@ void MASS3DPA::runHipVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dx) { MASS3DPA_9 } @@ -249,6 +250,7 @@ void MASS3DPA::runHipVariantImpl(VariantID vid) { } // outer lambda (ctx) ); // RAJA::launch + //clang-format on } // loop 
over kernel reps stopTimer(); diff --git a/src/apps/MASS3DPA-OMP.cpp b/src/apps/MASS3DPA-OMP.cpp index 70ab844d4..0db13cd1f 100644 --- a/src/apps/MASS3DPA-OMP.cpp +++ b/src/apps/MASS3DPA-OMP.cpp @@ -39,53 +39,53 @@ void MASS3DPA::runOpenMPVariant(VariantID vid) { MASS3DPA_0_CPU - CPU_FOREACH(dy, y, MPA_D1D) { - CPU_FOREACH(dx, x, MPA_D1D){ + CPU_FOREACH(dy, y, mpa::D1D) { + CPU_FOREACH(dx, x, mpa::D1D){ MASS3DPA_1 } - CPU_FOREACH(dx, x, MPA_Q1D) { + CPU_FOREACH(dx, x, mpa::Q1D) { MASS3DPA_2 } } - CPU_FOREACH(dy, y, MPA_D1D) { - CPU_FOREACH(qx, x, MPA_Q1D) { + CPU_FOREACH(dy, y, mpa::D1D) { + CPU_FOREACH(qx, x, mpa::Q1D) { MASS3DPA_3 } } - CPU_FOREACH(qy, y, MPA_Q1D) { - CPU_FOREACH(qx, x, MPA_Q1D) { + CPU_FOREACH(qy, y, mpa::Q1D) { + CPU_FOREACH(qx, x, mpa::Q1D) { MASS3DPA_4 } } - CPU_FOREACH(qy, y, MPA_Q1D) { - CPU_FOREACH(qx, x, MPA_Q1D) { + CPU_FOREACH(qy, y, mpa::Q1D) { + CPU_FOREACH(qx, x, mpa::Q1D) { MASS3DPA_5 } } - CPU_FOREACH(d, y, MPA_D1D) { - CPU_FOREACH(q, x, MPA_Q1D) { + CPU_FOREACH(d, y, mpa::D1D) { + CPU_FOREACH(q, x, mpa::Q1D) { MASS3DPA_6 } } - CPU_FOREACH(qy, y, MPA_Q1D) { - CPU_FOREACH(dx, x, MPA_D1D) { + CPU_FOREACH(qy, y, mpa::Q1D) { + CPU_FOREACH(dx, x, mpa::D1D) { MASS3DPA_7 } } - CPU_FOREACH(dy, y, MPA_D1D) { - CPU_FOREACH(dx, x, MPA_D1D) { + CPU_FOREACH(dy, y, mpa::D1D) { + CPU_FOREACH(dx, x, mpa::D1D) { MASS3DPA_8 } } - CPU_FOREACH(dy, y, MPA_D1D) { - CPU_FOREACH(dx, x, MPA_D1D) { + CPU_FOREACH(dy, y, mpa::D1D) { + CPU_FOREACH(dx, x, mpa::D1D) { MASS3DPA_9 } } @@ -114,6 +114,7 @@ void MASS3DPA::runOpenMPVariant(VariantID vid) { for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { //Grid is empty as the host does not need a compute grid to be specified + //clang-format off RAJA::launch( res, RAJA::LaunchParams(), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { @@ -123,15 +124,15 @@ void MASS3DPA::runOpenMPVariant(VariantID vid) { MASS3DPA_0_CPU - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, 
RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dx) { MASS3DPA_1 } ); // RAJA::loop - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type dx) { MASS3DPA_2 } @@ -141,9 +142,9 @@ void MASS3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type qx) { MASS3DPA_3 } @@ -153,9 +154,9 @@ void MASS3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type qx) { MASS3DPA_4 } @@ -165,9 +166,9 @@ void MASS3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type qx) { MASS3DPA_5 } @@ -177,9 +178,9 @@ void MASS3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type q) { MASS3DPA_6 } @@ -189,9 +190,9 @@ void MASS3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, 
RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dx) { MASS3DPA_7 } @@ -201,9 +202,9 @@ void MASS3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dx) { MASS3DPA_8 } @@ -213,9 +214,9 @@ void MASS3DPA::runOpenMPVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dx) { MASS3DPA_9 } @@ -228,6 +229,7 @@ void MASS3DPA::runOpenMPVariant(VariantID vid) { } // outer lambda (ctx) ); // // RAJA::launch + //clang-format on } // loop over kernel reps stopTimer(); diff --git a/src/apps/MASS3DPA-Seq.cpp b/src/apps/MASS3DPA-Seq.cpp index acee95f30..17aea0cd5 100644 --- a/src/apps/MASS3DPA-Seq.cpp +++ b/src/apps/MASS3DPA-Seq.cpp @@ -36,53 +36,53 @@ void MASS3DPA::runSeqVariant(VariantID vid) { MASS3DPA_0_CPU - CPU_FOREACH(dy, y, MPA_D1D) { - CPU_FOREACH(dx, x, MPA_D1D){ + CPU_FOREACH(dy, y, mpa::D1D) { + CPU_FOREACH(dx, x, mpa::D1D){ MASS3DPA_1 } - CPU_FOREACH(dx, x, MPA_Q1D) { + CPU_FOREACH(dx, x, mpa::Q1D) { MASS3DPA_2 } } - CPU_FOREACH(dy, y, MPA_D1D) { - CPU_FOREACH(qx, x, MPA_Q1D) { + CPU_FOREACH(dy, y, mpa::D1D) { + CPU_FOREACH(qx, x, mpa::Q1D) { MASS3DPA_3 } } - CPU_FOREACH(qy, y, MPA_Q1D) { - CPU_FOREACH(qx, x, MPA_Q1D) { + CPU_FOREACH(qy, y, mpa::Q1D) { + CPU_FOREACH(qx, x, mpa::Q1D) { MASS3DPA_4 } } - CPU_FOREACH(qy, y, MPA_Q1D) { - CPU_FOREACH(qx, x, MPA_Q1D) { + CPU_FOREACH(qy, y, mpa::Q1D) { + CPU_FOREACH(qx, x, mpa::Q1D) { MASS3DPA_5 } } - CPU_FOREACH(d, y, MPA_D1D) { - CPU_FOREACH(q, x, MPA_Q1D) { + CPU_FOREACH(d, y, mpa::D1D) { + CPU_FOREACH(q, x, mpa::Q1D) { MASS3DPA_6 } } - CPU_FOREACH(qy, y, MPA_Q1D) 
{ - CPU_FOREACH(dx, x, MPA_D1D) { + CPU_FOREACH(qy, y, mpa::Q1D) { + CPU_FOREACH(dx, x, mpa::D1D) { MASS3DPA_7 } } - CPU_FOREACH(dy, y, MPA_D1D) { - CPU_FOREACH(dx, x, MPA_D1D) { + CPU_FOREACH(dy, y, mpa::D1D) { + CPU_FOREACH(dx, x, mpa::D1D) { MASS3DPA_8 } } - CPU_FOREACH(dy, y, MPA_D1D) { - CPU_FOREACH(dx, x, MPA_D1D) { + CPU_FOREACH(dy, y, mpa::D1D) { + CPU_FOREACH(dx, x, mpa::D1D) { MASS3DPA_9 } } @@ -112,6 +112,7 @@ void MASS3DPA::runSeqVariant(VariantID vid) { // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { + //clang-format off RAJA::launch( res, RAJA::LaunchParams(), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { @@ -121,15 +122,15 @@ void MASS3DPA::runSeqVariant(VariantID vid) { MASS3DPA_0_CPU - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dx) { MASS3DPA_1 } ); // RAJA::loop - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type dx) { MASS3DPA_2 } @@ -139,9 +140,9 @@ void MASS3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type qx) { MASS3DPA_3 } @@ -151,9 +152,9 @@ void MASS3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type qx) { MASS3DPA_4 } @@ -163,9 +164,9 @@ void MASS3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, 
RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type qx) { MASS3DPA_5 } @@ -175,9 +176,9 @@ void MASS3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type q) { MASS3DPA_6 } @@ -187,9 +188,9 @@ void MASS3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dx) { MASS3DPA_7 } @@ -199,9 +200,9 @@ void MASS3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dx) { MASS3DPA_8 } @@ -211,9 +212,9 @@ void MASS3DPA::runSeqVariant(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dx) { MASS3DPA_9 } @@ -226,6 +227,7 @@ void MASS3DPA::runSeqVariant(VariantID vid) { } // outer lambda (ctx) ); // RAJA::launch + //clang-format on } // loop over kernel reps stopTimer(); diff --git a/src/apps/MASS3DPA-Sycl.cpp b/src/apps/MASS3DPA-Sycl.cpp index 5ac06c4e0..fc2ff5153 100644 --- a/src/apps/MASS3DPA-Sycl.cpp +++ b/src/apps/MASS3DPA-Sycl.cpp @@ -33,8 +33,8 @@ void MASS3DPA::runSyclVariantImpl(VariantID vid) { 
MASS3DPA_DATA_SETUP; - const ::sycl::range<3> workGroupSize(1, MPA_Q1D, MPA_Q1D); - const ::sycl::range<3> gridSize(1, MPA_Q1D, MPA_Q1D*NE); + const ::sycl::range<3> workGroupSize(1, mpa::Q1D, mpa::Q1D); + const ::sycl::range<3> gridSize(1, mpa::Q1D, mpa::Q1D*NE); switch (vid) { @@ -46,8 +46,8 @@ void MASS3DPA::runSyclVariantImpl(VariantID vid) { qu->submit([&](::sycl::handler& h) { - constexpr Index_type MQ1 = MPA_Q1D; - constexpr Index_type MD1 = MPA_D1D; + constexpr Index_type MQ1 = mpa::Q1D; + constexpr Index_type MD1 = mpa::D1D; constexpr Index_type MDQ = (MQ1 > MD1) ? MQ1 : MD1; auto sDQ_vec = ::sycl::local_accessor(::sycl::range<1>(MQ1 * MD1), h); @@ -74,57 +74,57 @@ void MASS3DPA::runSyclVariantImpl(VariantID vid) { Real_type(*QQD)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])sm0; Real_type(*QDD)[MD1][MD1] = (Real_type(*)[MD1][MD1])sm1; - SYCL_FOREACH_THREAD(dy, 1, MPA_D1D) { - SYCL_FOREACH_THREAD(dx, 2, MPA_D1D){ + SYCL_FOREACH_THREAD(dy, 1, mpa::D1D) { + SYCL_FOREACH_THREAD(dx, 2, mpa::D1D){ MASS3DPA_1 } - SYCL_FOREACH_THREAD(dx, 2, MPA_Q1D) { + SYCL_FOREACH_THREAD(dx, 2, mpa::Q1D) { MASS3DPA_2 } } itm.barrier(::sycl::access::fence_space::local_space); - SYCL_FOREACH_THREAD(dy, 1, MPA_D1D) { - SYCL_FOREACH_THREAD(qx, 2, MPA_Q1D) { + SYCL_FOREACH_THREAD(dy, 1, mpa::D1D) { + SYCL_FOREACH_THREAD(qx, 2, mpa::Q1D) { MASS3DPA_3 } } itm.barrier(::sycl::access::fence_space::local_space); - SYCL_FOREACH_THREAD(qy, 1, MPA_Q1D) { - SYCL_FOREACH_THREAD(qx, 2, MPA_Q1D) { + SYCL_FOREACH_THREAD(qy, 1, mpa::Q1D) { + SYCL_FOREACH_THREAD(qx, 2, mpa::Q1D) { MASS3DPA_4 } } itm.barrier(::sycl::access::fence_space::local_space); - SYCL_FOREACH_THREAD(qy, 1, MPA_Q1D) { - SYCL_FOREACH_THREAD(qx, 2, MPA_Q1D) { + SYCL_FOREACH_THREAD(qy, 1, mpa::Q1D) { + SYCL_FOREACH_THREAD(qx, 2, mpa::Q1D) { MASS3DPA_5 } } itm.barrier(::sycl::access::fence_space::local_space); - SYCL_FOREACH_THREAD(d, 1, MPA_D1D) { - SYCL_FOREACH_THREAD(q, 2, MPA_Q1D) { + SYCL_FOREACH_THREAD(d, 1, mpa::D1D) { + 
SYCL_FOREACH_THREAD(q, 2, mpa::Q1D) { MASS3DPA_6 } } itm.barrier(::sycl::access::fence_space::local_space); - SYCL_FOREACH_THREAD(qy, 1, MPA_Q1D) { - SYCL_FOREACH_THREAD(dx, 2, MPA_D1D) { + SYCL_FOREACH_THREAD(qy, 1, mpa::Q1D) { + SYCL_FOREACH_THREAD(dx, 2, mpa::D1D) { MASS3DPA_7 } } itm.barrier(::sycl::access::fence_space::local_space); - SYCL_FOREACH_THREAD(dy, 1, MPA_D1D) { - SYCL_FOREACH_THREAD(dx, 2, MPA_D1D) { + SYCL_FOREACH_THREAD(dy, 1, mpa::D1D) { + SYCL_FOREACH_THREAD(dx, 2, mpa::D1D) { MASS3DPA_8 } } itm.barrier(::sycl::access::fence_space::local_space); - SYCL_FOREACH_THREAD(dy, 1, MPA_D1D) { - SYCL_FOREACH_THREAD(dx, 2, MPA_D1D) { + SYCL_FOREACH_THREAD(dy, 1, mpa::D1D) { + SYCL_FOREACH_THREAD(dx, 2, mpa::D1D) { MASS3DPA_9 } } @@ -153,8 +153,8 @@ void MASS3DPA::runSyclVariantImpl(VariantID vid) { //Caclulate amount of shared memory needed size_t shmem = 0; { - constexpr Index_type MQ1 = MPA_Q1D; - constexpr Index_type MD1 = MPA_D1D; + constexpr Index_type MQ1 = mpa::Q1D; + constexpr Index_type MD1 = mpa::D1D; constexpr Index_type MDQ = (MQ1 > MD1) ? MQ1 : MD1; constexpr Index_type no_mats = 2; @@ -165,17 +165,18 @@ void MASS3DPA::runSyclVariantImpl(VariantID vid) { // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { + //clang-format off RAJA::launch( res, RAJA::LaunchParams(RAJA::Teams(NE), - RAJA::Threads(MPA_Q1D, MPA_Q1D), shmem), + RAJA::Threads(mpa::Q1D, mpa::Q1D), shmem), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { RAJA::loop(ctx, RAJA::RangeSegment(0, NE), [&](Index_type e) { //Redefine inside the lambda to keep consistent with base version - constexpr Index_type MQ1 = MPA_Q1D; - constexpr Index_type MD1 = MPA_D1D; + constexpr Index_type MQ1 = mpa::Q1D; + constexpr Index_type MD1 = mpa::D1D; constexpr Index_type MDQ = (MQ1 > MD1) ? 
MQ1 : MD1; Real_ptr sDQ = ctx.getSharedMemory(MQ1 * MD1); @@ -192,15 +193,15 @@ void MASS3DPA::runSyclVariantImpl(VariantID vid) { Real_type(*QQD)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])sm0; Real_type(*QDD)[MD1][MD1] = (Real_type(*)[MD1][MD1])sm1; - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dx) { MASS3DPA_1 } ); // RAJA::loop - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type dx) { MASS3DPA_2 } @@ -210,9 +211,9 @@ void MASS3DPA::runSyclVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type qx) { MASS3DPA_3 } @@ -222,9 +223,9 @@ void MASS3DPA::runSyclVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type qx) { MASS3DPA_4 } @@ -234,9 +235,9 @@ void MASS3DPA::runSyclVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type qx) { MASS3DPA_5 } @@ -246,9 +247,9 @@ void MASS3DPA::runSyclVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), 
[&](Index_type q) { MASS3DPA_6 } @@ -258,9 +259,9 @@ void MASS3DPA::runSyclVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dx) { MASS3DPA_7 } @@ -270,9 +271,9 @@ void MASS3DPA::runSyclVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dx) { MASS3DPA_8 } @@ -282,9 +283,9 @@ void MASS3DPA::runSyclVariantImpl(VariantID vid) { ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dx) { MASS3DPA_9 } @@ -297,6 +298,7 @@ void MASS3DPA::runSyclVariantImpl(VariantID vid) { } // outer lambda (ctx) ); // RAJA::launch + //clang-format on } // loop over kernel reps stopTimer(); diff --git a/src/apps/MASS3DPA.cpp b/src/apps/MASS3DPA.cpp index a256a7187..9ee41e958 100644 --- a/src/apps/MASS3DPA.cpp +++ b/src/apps/MASS3DPA.cpp @@ -25,29 +25,29 @@ MASS3DPA::MASS3DPA(const RunParams& params) { m_NE_default = 8000; - setDefaultProblemSize(m_NE_default*MPA_Q1D*MPA_Q1D*MPA_Q1D); + setDefaultProblemSize(m_NE_default*mpa::D1D*mpa::D1D*mpa::D1D); setDefaultReps(50); - m_NE = std::max((getTargetProblemSize() + (MPA_Q1D*MPA_Q1D*MPA_Q1D)/2) / (MPA_Q1D*MPA_Q1D*MPA_Q1D), Index_type(1)); + m_NE = std::max((getTargetProblemSize() + (mpa::D1D*mpa::D1D*mpa::D1D)/2) / (mpa::D1D*mpa::D1D*mpa::D1D), Index_type(1)); - setActualProblemSize( m_NE*MPA_Q1D*MPA_Q1D*MPA_Q1D ); + setActualProblemSize( m_NE*mpa::D1D*mpa::D1D*mpa::D1D ); - setItsPerRep( 
m_NE*MPA_Q1D*MPA_Q1D ); + setItsPerRep( m_NE*mpa::D1D*mpa::D1D ); setKernelsPerRep(1); - setBytesReadPerRep( 2*sizeof(Real_type) * MPA_Q1D*MPA_D1D + // B, Bt - 1*sizeof(Real_type) * MPA_D1D*MPA_D1D*MPA_D1D*m_NE + // X - 1*sizeof(Real_type) * MPA_Q1D*MPA_Q1D*MPA_Q1D*m_NE ); // D + setBytesReadPerRep( 2*sizeof(Real_type) * mpa::Q1D*mpa::D1D + // B, Bt + 1*sizeof(Real_type) * mpa::D1D*mpa::D1D*mpa::D1D*m_NE + // X + 1*sizeof(Real_type) * mpa::Q1D*mpa::Q1D*mpa::Q1D*m_NE ); // D setBytesWrittenPerRep( 0 ); - setBytesModifyWrittenPerRep( 1*sizeof(Real_type) * MPA_D1D*MPA_D1D*MPA_D1D*m_NE ); // Y + setBytesModifyWrittenPerRep( 1*sizeof(Real_type) * mpa::D1D*mpa::D1D*mpa::D1D*m_NE ); // Y setBytesAtomicModifyWrittenPerRep( 0 ); - setFLOPsPerRep(m_NE * (2 * MPA_D1D * MPA_D1D * MPA_D1D * MPA_Q1D + - 2 * MPA_D1D * MPA_D1D * MPA_Q1D * MPA_Q1D + - 2 * MPA_D1D * MPA_Q1D * MPA_Q1D * MPA_Q1D + MPA_Q1D * MPA_Q1D * MPA_Q1D + - 2 * MPA_Q1D * MPA_Q1D * MPA_Q1D * MPA_D1D + - 2 * MPA_Q1D * MPA_Q1D * MPA_D1D * MPA_D1D + - 2 * MPA_Q1D * MPA_D1D * MPA_D1D * MPA_D1D + MPA_D1D * MPA_D1D * MPA_D1D)); + setFLOPsPerRep(m_NE * (2 * mpa::D1D * mpa::D1D * mpa::D1D * mpa::Q1D + + 2 * mpa::D1D * mpa::D1D * mpa::Q1D * mpa::Q1D + + 2 * mpa::D1D * mpa::Q1D * mpa::Q1D * mpa::Q1D + mpa::Q1D * mpa::Q1D * mpa::Q1D + + 2 * mpa::Q1D * mpa::Q1D * mpa::Q1D * mpa::D1D + + 2 * mpa::Q1D * mpa::Q1D * mpa::D1D * mpa::D1D + + 2 * mpa::Q1D * mpa::D1D * mpa::D1D * mpa::D1D + mpa::D1D * mpa::D1D * mpa::D1D)); setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); setChecksumTolerance(ChecksumTolerance::normal); @@ -66,16 +66,16 @@ MASS3DPA::~MASS3DPA() void MASS3DPA::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { - allocAndInitDataConst(m_B, Index_type(MPA_Q1D*MPA_D1D), Real_type(1.0), vid); - allocAndInitDataConst(m_Bt,Index_type(MPA_Q1D*MPA_D1D), Real_type(1.0), vid); - allocAndInitDataConst(m_D, Index_type(MPA_Q1D*MPA_Q1D*MPA_Q1D*m_NE), Real_type(1.0), vid); - 
allocAndInitDataConst(m_X, Index_type(MPA_D1D*MPA_D1D*MPA_D1D*m_NE), Real_type(1.0), vid); - allocAndInitDataConst(m_Y, Index_type(MPA_D1D*MPA_D1D*MPA_D1D*m_NE), Real_type(0.0), vid); + allocAndInitDataConst(m_B, Index_type(mpa::Q1D*mpa::D1D), Real_type(1.0), vid); + allocAndInitDataConst(m_Bt,Index_type(mpa::Q1D*mpa::D1D), Real_type(1.0), vid); + allocAndInitDataConst(m_D, Index_type(mpa::Q1D*mpa::Q1D*mpa::Q1D*m_NE), Real_type(1.0), vid); + allocAndInitDataConst(m_X, Index_type(mpa::D1D*mpa::D1D*mpa::D1D*m_NE), Real_type(1.0), vid); + allocAndInitDataConst(m_Y, Index_type(mpa::D1D*mpa::D1D*mpa::D1D*m_NE), Real_type(0.0), vid); } void MASS3DPA::updateChecksum(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { - addToChecksum(m_Y, MPA_D1D*MPA_D1D*MPA_D1D*m_NE, vid); + addToChecksum(m_Y, mpa::D1D*mpa::D1D*mpa::D1D*m_NE, vid); } void MASS3DPA::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) diff --git a/src/apps/MASS3DPA.hpp b/src/apps/MASS3DPA.hpp index a7741b907..1f498526e 100644 --- a/src/apps/MASS3DPA.hpp +++ b/src/apps/MASS3DPA.hpp @@ -10,140 +10,140 @@ /// Action of 3D mass matrix via partial assembly /// /// Based on MFEM's/CEED algorithms. -/// Reference implementation -/// https://github.com/mfem/mfem/blob/master/fem/bilininteg_mass_pa.cpp#L925 +/// Reference implementation - MFEM-v4.9 +/// https://github.com/mfem/mfem/blob/v4.9/fem/integ/bilininteg_mass_kernels.hpp#L809 /// -/// for (int e = 0; e < NE; ++e) { +/// for (Index_type e = 0; e < NE; ++e) { /// -/// constexpr int MQ1 = MPA_Q1D; -/// constexpr int MD1 = MPA_D1D; -/// constexpr int MDQ = (MQ1 > MD1) ? 
MQ1 : MD1; -/// double sDQ[MQ1 * MD1]; -/// double(*Bsmem)[MD1] = (double(*)[MD1])sDQ; -/// double(*Btsmem)[MQ1] = (double(*)[MQ1])sDQ; -/// double sm0[MDQ * MDQ * MDQ]; -/// double sm1[MDQ * MDQ * MDQ]; -/// double(*Xsmem)[MD1][MD1] = (double(*)[MD1][MD1])sm0; -/// double(*DDQ)[MD1][MQ1] = (double(*)[MD1][MQ1])sm1; -/// double(*DQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm0; -/// double(*QQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm1; -/// double(*QQD)[MQ1][MD1] = (double(*)[MQ1][MD1])sm0; -/// double(*QDD)[MD1][MD1] = (double(*)[MD1][MD1])sm1; +/// constexpr Index_type MQ1 = mpa::Q1D; +/// constexpr Index_type MD1 = mpa::D1D; +/// constexpr Index_type MDQ = (MQ1 > MD1) ? MQ1 : MD1; +/// Real_type sDQ[MQ1 * MD1]; +/// Real_type(*Bsmem)[MD1] = (Real_type(*)[MD1])sDQ; +/// Real_type(*Btsmem)[MQ1] = (Real_type(*)[MQ1])sDQ; +/// Real_type sm0[MDQ * MDQ * MDQ]; +/// Real_type sm1[MDQ * MDQ * MDQ]; +/// Real_type(*Xsmem)[MD1][MD1] = (Real_type(*)[MD1][MD1])sm0; +/// Real_type(*DDQ)[MD1][MQ1] = (Real_type(*)[MD1][MQ1])sm1; +/// Real_type(*DQQ)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])sm0; +/// Real_type(*QQQ)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])sm1; +/// Real_type(*QQD)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])sm0; +/// Real_type(*QDD)[MD1][MD1] = (Real_type(*)[MD1][MD1])sm1; /// -/// for(int dy=0; dy MD1) ? MQ1 : MD1; \ - double sDQ[MQ1 * MD1]; \ - double(*Bsmem)[MD1] = (double(*)[MD1])sDQ; \ - double(*Btsmem)[MQ1] = (double(*)[MQ1])sDQ; \ - double sm0[MDQ * MDQ * MDQ]; \ - double sm1[MDQ * MDQ * MDQ]; \ - double(*Xsmem)[MD1][MD1] = (double(*)[MD1][MD1])sm0; \ - double(*DDQ)[MD1][MQ1] = (double(*)[MD1][MQ1])sm1; \ - double(*DQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm0; \ - double(*QQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm1; \ - double(*QQD)[MQ1][MD1] = (double(*)[MQ1][MD1])sm0; \ - double(*QDD)[MD1][MD1] = (double(*)[MD1][MD1])sm1; - -#define MASS3DPA_0_GPU \ - constexpr int MQ1 = MPA_Q1D; \ - constexpr int MD1 = MPA_D1D; \ - constexpr int MDQ = (MQ1 > MD1) ? 
MQ1 : MD1; \ - RAJA_TEAM_SHARED double sDQ[MQ1 * MD1]; \ - double(*Bsmem)[MD1] = (double(*)[MD1])sDQ; \ - double(*Btsmem)[MQ1] = (double(*)[MQ1])sDQ; \ - RAJA_TEAM_SHARED double sm0[MDQ * MDQ * MDQ]; \ - RAJA_TEAM_SHARED double sm1[MDQ * MDQ * MDQ]; \ - double(*Xsmem)[MD1][MD1] = (double(*)[MD1][MD1])sm0; \ - double(*DDQ)[MD1][MQ1] = (double(*)[MD1][MQ1])sm1; \ - double(*DQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm0; \ - double(*QQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm1; \ - double(*QQD)[MQ1][MD1] = (double(*)[MQ1][MD1])sm0; \ - double(*QDD)[MD1][MD1] = (double(*)[MD1][MD1])sm1; - -#define MASS3DPA_1 \ - RAJAPERF_UNROLL(MD1) \ -for (Index_type dz = 0; dz< MPA_D1D; ++dz) { \ -Xsmem[dz][dy][dx] = MPA_X(dx, dy, dz, e); \ -} - -#define MASS3DPA_2 \ - Bsmem[dx][dy] = MPA_B(dx, dy); - -// 2 * MPA_D1D * MPA_D1D * MPA_D1D * MPA_Q1D -#define MASS3DPA_3 \ - Real_type u[MPA_D1D]; \ -RAJAPERF_UNROLL(MD1) \ -for (Index_type dz = 0; dz < MPA_D1D; dz++) { \ -u[dz] = 0; \ -} \ -RAJAPERF_UNROLL(MD1) \ -for (Index_type dx = 0; dx < MPA_D1D; ++dx) { \ -RAJAPERF_UNROLL(MD1) \ -for (Index_type dz = 0; dz < MPA_D1D; ++dz) { \ -u[dz] += Xsmem[dz][dy][dx] * Bsmem[qx][dx]; \ -} \ -} \ -RAJAPERF_UNROLL(MD1) \ -for (Index_type dz = 0; dz < MPA_D1D; ++dz) { \ -DDQ[dz][dy][qx] = u[dz]; \ -} - -//2 * MPA_D1D * MPA_D1D * MPA_Q1D * MPA_Q1D -#define MASS3DPA_4 \ - Real_type u[MPA_D1D]; \ - RAJAPERF_UNROLL(MD1) \ - for (Index_type dz = 0; dz < MPA_D1D; dz++) { \ - u[dz] = 0; \ - } \ - RAJAPERF_UNROLL(MD1) \ - for (Index_type dy = 0; dy < MPA_D1D; ++dy) { \ - RAJAPERF_UNROLL(MD1) \ - for (Index_type dz = 0; dz < MPA_D1D; dz++) { \ - u[dz] += DDQ[dz][dy][qx] * Bsmem[qy][dy]; \ - } \ - } \ - RAJAPERF_UNROLL(MD1) \ - for (Index_type dz = 0; dz < MPA_D1D; dz++) { \ - DQQ[dz][qy][qx] = u[dz]; \ - } - -//2 * MPA_D1D * MPA_Q1D * MPA_Q1D * MPA_Q1D + MPA_Q1D * MPA_Q1D * MPA_Q1D -#define MASS3DPA_5 \ - Real_type u[MPA_Q1D]; \ - RAJAPERF_UNROLL(MQ1) \ - for (Index_type qz = 0; qz < MPA_Q1D; qz++) { \ - u[qz] = 0; 
\ - } \ - RAJAPERF_UNROLL(MD1) \ - for (Index_type dz = 0; dz < MPA_D1D; ++dz) { \ - RAJAPERF_UNROLL(MQ1) \ - for (Index_type qz = 0; qz < MPA_Q1D; qz++) { \ - u[qz] += DQQ[dz][qy][qx] * Bsmem[qz][dz]; \ - } \ - } \ - RAJAPERF_UNROLL(MQ1) \ - for (Index_type qz = 0; qz < MPA_Q1D; qz++) { \ - QQQ[qz][qy][qx] = u[qz] * MPA_D(qx, qy, qz, e); \ - } - -#define MASS3DPA_6 \ - Btsmem[d][q] = MPA_Bt(q, d); - -//2 * MPA_Q1D * MPA_Q1D * MPA_Q1D * MPA_D1D -#define MASS3DPA_7 \ - Real_type u[MPA_Q1D]; \ -RAJAPERF_UNROLL(MQ1) \ -for (Index_type qz = 0; qz < MPA_Q1D; ++qz) { \ - u[qz] = 0; \ - } \ -RAJAPERF_UNROLL(MQ1) \ -for (Index_type qx = 0; qx < MPA_Q1D; ++qx) { \ - RAJAPERF_UNROLL(MQ1) \ - for (Index_type qz = 0; qz < MPA_Q1D; ++qz) { \ - u[qz] += QQQ[qz][qy][qx] * Btsmem[dx][qx]; \ - } \ - } \ -RAJAPERF_UNROLL(MQ1) \ -for (Index_type qz = 0; qz < MPA_Q1D; ++qz) { \ - QQD[qz][qy][dx] = u[qz]; \ - } - -// 2 * MPA_Q1D * MPA_Q1D * MPA_D1D * MPA_D1D -#define MASS3DPA_8 \ - Real_type u[MPA_Q1D]; \ - RAJAPERF_UNROLL(MQ1) \ - for (Index_type qz = 0; qz < MPA_Q1D; ++qz) { \ - u[qz] = 0; \ - } \ - RAJAPERF_UNROLL(MQ1) \ - for (Index_type qy = 0; qy < MPA_Q1D; ++qy) { \ - RAJAPERF_UNROLL(MQ1) \ - for (Index_type qz = 0; qz < MPA_Q1D; ++qz) { \ - u[qz] += QQD[qz][qy][dx] * Btsmem[dy][qy]; \ - } \ - } \ - RAJAPERF_UNROLL(MQ1) \ - for (Index_type qz = 0; qz < MPA_Q1D; ++qz) { \ - QDD[qz][dy][dx] = u[qz]; \ - } - -//2 * MPA_Q1D * MPA_D1D * MPA_D1D * MPA_D1D + MPA_D1D * MPA_D1D * MPA_D1D -#define MASS3DPA_9 \ - Real_type u[MPA_D1D]; \ - RAJAPERF_UNROLL(MD1) \ - for (Index_type dz = 0; dz < MPA_D1D; ++dz) { \ - u[dz] = 0; \ - } \ - RAJAPERF_UNROLL(MQ1) \ - for (Index_type qz = 0; qz < MPA_Q1D; ++qz) { \ - RAJAPERF_UNROLL(MD1) \ - for (Index_type dz = 0; dz < MPA_D1D; ++dz) { \ - u[dz] += QDD[qz][dy][dx] * Btsmem[dz][qz]; \ - } \ - } \ - RAJAPERF_UNROLL(MD1) \ - for (Index_type dz = 0; dz < MPA_D1D; ++dz) { \ - MPA_Y(dx, dy, dz, e) += u[dz]; \ - } - - -namespace rajaperf -{ +// Number of 
Dofs/Qpts in 1D +namespace mpa { +constexpr RAJA::Index_type D1D = 4; +constexpr RAJA::Index_type Q1D = 5; +} // namespace mpa +#define MPA_B(x, y) B[x + mpa::Q1D * y] +#define MPA_Bt(x, y) Bt[x + mpa::D1D * y] +#define MPA_X(dx, dy, dz, e) \ + X[dx + mpa::D1D * dy + mpa::D1D * mpa::D1D * dz + \ + mpa::D1D * mpa::D1D * mpa::D1D * e] +#define MPA_Y(dx, dy, dz, e) \ + Y[dx + mpa::D1D * dy + mpa::D1D * mpa::D1D * dz + \ + mpa::D1D * mpa::D1D * mpa::D1D * e] +#define MPA_D(qx, qy, qz, e) \ + D[qx + mpa::Q1D * qy + mpa::Q1D * mpa::Q1D * qz + \ + mpa::Q1D * mpa::Q1D * mpa::Q1D * e] + +#define MASS3DPA_0_CPU \ + constexpr Index_type MQ1 = mpa::Q1D; \ + constexpr Index_type MD1 = mpa::D1D; \ + constexpr Index_type MDQ = (MQ1 > MD1) ? MQ1 : MD1; \ + Real_type sDQ[MQ1 * MD1]; \ + Real_type(*Bsmem)[MD1] = (Real_type(*)[MD1])sDQ; \ + Real_type(*Btsmem)[MQ1] = (Real_type(*)[MQ1])sDQ; \ + Real_type sm0[MDQ * MDQ * MDQ]; \ + Real_type sm1[MDQ * MDQ * MDQ]; \ + Real_type(*Xsmem)[MD1][MD1] = (Real_type(*)[MD1][MD1])sm0; \ + Real_type(*DDQ)[MD1][MQ1] = (Real_type(*)[MD1][MQ1])sm1; \ + Real_type(*DQQ)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])sm0; \ + Real_type(*QQQ)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])sm1; \ + Real_type(*QQD)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])sm0; \ + Real_type(*QDD)[MD1][MD1] = (Real_type(*)[MD1][MD1])sm1; + +#define MASS3DPA_0_GPU \ + constexpr Index_type MQ1 = mpa::Q1D; \ + constexpr Index_type MD1 = mpa::D1D; \ + constexpr Index_type MDQ = (MQ1 > MD1) ? 
MQ1 : MD1; \ + RAJA_TEAM_SHARED Real_type sDQ[MQ1 * MD1]; \ + Real_type(*Bsmem)[MD1] = (Real_type(*)[MD1])sDQ; \ + Real_type(*Btsmem)[MQ1] = (Real_type(*)[MQ1])sDQ; \ + RAJA_TEAM_SHARED Real_type sm0[MDQ * MDQ * MDQ]; \ + RAJA_TEAM_SHARED Real_type sm1[MDQ * MDQ * MDQ]; \ + Real_type(*Xsmem)[MD1][MD1] = (Real_type(*)[MD1][MD1])sm0; \ + Real_type(*DDQ)[MD1][MQ1] = (Real_type(*)[MD1][MQ1])sm1; \ + Real_type(*DQQ)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])sm0; \ + Real_type(*QQQ)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])sm1; \ + Real_type(*QQD)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])sm0; \ + Real_type(*QDD)[MD1][MD1] = (Real_type(*)[MD1][MD1])sm1; + +#define MASS3DPA_1 \ + RAJAPERF_UNROLL(MD1) \ + for (Index_type dz = 0; dz < mpa::D1D; ++dz) { \ + Xsmem[dz][dy][dx] = MPA_X(dx, dy, dz, e); \ + } + +#define MASS3DPA_2 Bsmem[dx][dy] = MPA_B(dx, dy); + +// 2 * mpa::D1D * mpa::D1D * mpa::D1D * mpa::Q1D +#define MASS3DPA_3 \ + Real_type u[mpa::D1D]; \ + RAJAPERF_UNROLL(MD1) \ + for (Index_type dz = 0; dz < mpa::D1D; dz++) { \ + u[dz] = 0; \ + } \ + RAJAPERF_UNROLL(MD1) \ + for (Index_type dx = 0; dx < mpa::D1D; ++dx) { \ + RAJAPERF_UNROLL(MD1) \ + for (Index_type dz = 0; dz < mpa::D1D; ++dz) { \ + u[dz] += Xsmem[dz][dy][dx] * Bsmem[qx][dx]; \ + } \ + } \ + RAJAPERF_UNROLL(MD1) \ + for (Index_type dz = 0; dz < mpa::D1D; ++dz) { \ + DDQ[dz][dy][qx] = u[dz]; \ + } + +// 2 * mpa::D1D * mpa::D1D * mpa::Q1D * mpa::Q1D +#define MASS3DPA_4 \ + Real_type u[mpa::D1D]; \ + RAJAPERF_UNROLL(MD1) \ + for (Index_type dz = 0; dz < mpa::D1D; dz++) { \ + u[dz] = 0; \ + } \ + RAJAPERF_UNROLL(MD1) \ + for (Index_type dy = 0; dy < mpa::D1D; ++dy) { \ + RAJAPERF_UNROLL(MD1) \ + for (Index_type dz = 0; dz < mpa::D1D; dz++) { \ + u[dz] += DDQ[dz][dy][qx] * Bsmem[qy][dy]; \ + } \ + } \ + RAJAPERF_UNROLL(MD1) \ + for (Index_type dz = 0; dz < mpa::D1D; dz++) { \ + DQQ[dz][qy][qx] = u[dz]; \ + } + +// 2 * mpa::D1D * mpa::Q1D * mpa::Q1D * mpa::Q1D + mpa::Q1D * mpa::Q1D * +// mpa::Q1D +#define MASS3DPA_5 \ + 
Real_type u[mpa::Q1D]; \ + RAJAPERF_UNROLL(MQ1) \ + for (Index_type qz = 0; qz < mpa::Q1D; qz++) { \ + u[qz] = 0; \ + } \ + RAJAPERF_UNROLL(MD1) \ + for (Index_type dz = 0; dz < mpa::D1D; ++dz) { \ + RAJAPERF_UNROLL(MQ1) \ + for (Index_type qz = 0; qz < mpa::Q1D; qz++) { \ + u[qz] += DQQ[dz][qy][qx] * Bsmem[qz][dz]; \ + } \ + } \ + RAJAPERF_UNROLL(MQ1) \ + for (Index_type qz = 0; qz < mpa::Q1D; qz++) { \ + QQQ[qz][qy][qx] = u[qz] * MPA_D(qx, qy, qz, e); \ + } + +#define MASS3DPA_6 Btsmem[d][q] = MPA_Bt(q, d); + +// 2 * mpa::Q1D * mpa::Q1D * mpa::Q1D * mpa::D1D +#define MASS3DPA_7 \ + Real_type u[mpa::Q1D]; \ + RAJAPERF_UNROLL(MQ1) \ + for (Index_type qz = 0; qz < mpa::Q1D; ++qz) { \ + u[qz] = 0; \ + } \ + RAJAPERF_UNROLL(MQ1) \ + for (Index_type qx = 0; qx < mpa::Q1D; ++qx) { \ + RAJAPERF_UNROLL(MQ1) \ + for (Index_type qz = 0; qz < mpa::Q1D; ++qz) { \ + u[qz] += QQQ[qz][qy][qx] * Btsmem[dx][qx]; \ + } \ + } \ + RAJAPERF_UNROLL(MQ1) \ + for (Index_type qz = 0; qz < mpa::Q1D; ++qz) { \ + QQD[qz][qy][dx] = u[qz]; \ + } + +// 2 * mpa::Q1D * mpa::Q1D * mpa::D1D * mpa::D1D +#define MASS3DPA_8 \ + Real_type u[mpa::Q1D]; \ + RAJAPERF_UNROLL(MQ1) \ + for (Index_type qz = 0; qz < mpa::Q1D; ++qz) { \ + u[qz] = 0; \ + } \ + RAJAPERF_UNROLL(MQ1) \ + for (Index_type qy = 0; qy < mpa::Q1D; ++qy) { \ + RAJAPERF_UNROLL(MQ1) \ + for (Index_type qz = 0; qz < mpa::Q1D; ++qz) { \ + u[qz] += QQD[qz][qy][dx] * Btsmem[dy][qy]; \ + } \ + } \ + RAJAPERF_UNROLL(MQ1) \ + for (Index_type qz = 0; qz < mpa::Q1D; ++qz) { \ + QDD[qz][dy][dx] = u[qz]; \ + } + +// 2 * mpa::Q1D * mpa::D1D * mpa::D1D * mpa::D1D + mpa::D1D * mpa::D1D * +// mpa::D1D +#define MASS3DPA_9 \ + Real_type u[mpa::D1D]; \ + RAJAPERF_UNROLL(MD1) \ + for (Index_type dz = 0; dz < mpa::D1D; ++dz) { \ + u[dz] = 0; \ + } \ + RAJAPERF_UNROLL(MQ1) \ + for (Index_type qz = 0; qz < mpa::Q1D; ++qz) { \ + RAJAPERF_UNROLL(MD1) \ + for (Index_type dz = 0; dz < mpa::D1D; ++dz) { \ + u[dz] += QDD[qz][dy][dx] * Btsmem[dz][qz]; \ + } \ + } \ + 
RAJAPERF_UNROLL(MD1) \ + for (Index_type dz = 0; dz < mpa::D1D; ++dz) { \ + MPA_Y(dx, dy, dz, e) += u[dz]; \ + } + +namespace rajaperf { class RunParams; -namespace apps -{ +namespace apps { -class MASS3DPA : public KernelBase -{ +class MASS3DPA : public KernelBase { public: - - MASS3DPA(const RunParams& params); + MASS3DPA(const RunParams &params); ~MASS3DPA(); @@ -367,15 +367,12 @@ class MASS3DPA : public KernelBase void runSeqVariant(VariantID vid); void runOpenMPVariant(VariantID vid); - template < size_t block_size > - void runCudaVariantImpl(VariantID vid); - template < size_t block_size > - void runHipVariantImpl(VariantID vid); - template < size_t work_group_size > - void runSyclVariantImpl(VariantID vid); + template <size_t block_size> void runCudaVariantImpl(VariantID vid); + template <size_t block_size> void runHipVariantImpl(VariantID vid); + template <size_t work_group_size> void runSyclVariantImpl(VariantID vid); private: - static const size_t default_gpu_block_size = MPA_Q1D * MPA_Q1D; + static const size_t default_gpu_block_size = mpa::Q1D * mpa::Q1D; using gpu_block_sizes_type = integer::list_type<default_gpu_block_size>; Real_ptr m_B; diff --git a/src/apps/MASS3DPA_ATOMIC-Cuda.cpp b/src/apps/MASS3DPA_ATOMIC-Cuda.cpp new file mode 100644 index 000000000..968c1dab1 --- /dev/null +++ b/src/apps/MASS3DPA_ATOMIC-Cuda.cpp @@ -0,0 +1,307 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-25, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details.
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// Uncomment to add compiler directives loop unrolling +//#define USE_RAJAPERF_UNROLL + +#include "MASS3DPA_ATOMIC.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf { +namespace apps { + +template < size_t block_size > + __launch_bounds__(block_size) +__global__ void Mass3DPA_Atomic(const Real_ptr B, + const Real_ptr D, const Real_ptr X, const Index_ptr ElemToDoF, Real_ptr Y) { + + const Index_type e = blockIdx.x; + + MASS3DPA_ATOMIC_0_GPU; + + + GPU_FOREACH_THREAD_DIRECT(dz, z, mpa_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, mpa_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(dx, x, mpa_at::D1D) { + MASS3DPA_ATOMIC_1; + } + } + } + + GPU_FOREACH_THREAD_DIRECT(dz, z, 1) { + GPU_FOREACH_THREAD_DIRECT(d, y, mpa_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(q, x, mpa_at::Q1D) { + MASS3DPA_ATOMIC_2; + } + } + } + + + GPU_FOREACH_THREAD_DIRECT(dz, z, mpa_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, mpa_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, mpa_at::Q1D) { + MASS3DPA_ATOMIC_3; + } + } + } + + + GPU_FOREACH_THREAD_DIRECT(dz, z, mpa_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(qy, y, mpa_at::Q1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, mpa_at::Q1D) { + MASS3DPA_ATOMIC_4; + } + } + } + + GPU_FOREACH_THREAD_DIRECT(qz, z, mpa_at::Q1D) { + GPU_FOREACH_THREAD_DIRECT(qy, y, mpa_at::Q1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, mpa_at::Q1D) { + MASS3DPA_ATOMIC_5; + } + } + } + + GPU_FOREACH_THREAD_DIRECT(qz, z, mpa_at::Q1D) { + GPU_FOREACH_THREAD_DIRECT(qy, y, mpa_at::Q1D) { + GPU_FOREACH_THREAD_DIRECT(dx, x, mpa_at::D1D) { + MASS3DPA_ATOMIC_6; + } + } + } + + GPU_FOREACH_THREAD_DIRECT(qz, z, mpa_at::Q1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, mpa_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(dx, x, mpa_at::D1D) { + MASS3DPA_ATOMIC_7; + } + } + } + + GPU_FOREACH_THREAD_DIRECT(dz, z, 
mpa_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, mpa_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(dx, x, mpa_at::D1D) { + MASS3DPA_ATOMIC_8; + MASS3DPA_ATOMIC_9(RAJAPERF_ATOMIC_ADD_CUDA); + } + } + } + +} + +template < size_t block_size > +void MASS3DPA_ATOMIC::runCudaVariantImpl(VariantID vid) { + setBlockSize(block_size); + + const Index_type run_reps = getRunReps(); + + auto res{getCudaResource()}; + + MASS3DPA_ATOMIC_DATA_SETUP; + + switch (vid) { + + case Base_CUDA: { + + startTimer(); + // Loop counter increment uses macro to quiet C++20 compiler warning + for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { + + dim3 nthreads_per_block(mpa_at::Q1D, mpa_at::Q1D, mpa_at::Q1D); + constexpr size_t shmem = 0; + + RPlaunchCudaKernel( (Mass3DPA_Atomic), + NE, nthreads_per_block, + shmem, res.get_stream(), + B, D, X, ElemToDoF, Y ); + + } + stopTimer(); + + break; + } + + case RAJA_CUDA: { + + constexpr bool async = true; + + using launch_policy = RAJA::LaunchPolicy>; + + using outer_x = RAJA::LoopPolicy; + + using inner_x = RAJA::LoopPolicy>; + + using inner_y = RAJA::LoopPolicy>; + + using inner_z = RAJA::LoopPolicy>; + + startTimer(); + // Loop counter increment uses macro to quiet C++20 compiler warning + for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { + + //clang-format off + RAJA::launch( res, + RAJA::LaunchParams(RAJA::Teams(NE), + RAJA::Threads(mpa_at::Q1D, mpa_at::Q1D, mpa_at::Q1D)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, RAJA::RangeSegment(0, NE), + [&](Index_type e) { + + + MASS3DPA_ATOMIC_0_GPU; + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dx) { + MASS3DPA_ATOMIC_1; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + + RAJA::loop(ctx, 
RAJA::RangeSegment(0, 1), + [&](Index_type ) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type d) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type q) { + MASS3DPA_ATOMIC_2; + } // lambda (q) + ); // RAJA::loop + } // lambda (d) + ); // RAJA::loop + } // lambda () + ); // RAJA::loop + + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qx) { + MASS3DPA_ATOMIC_3; + } // lambda (qx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qx) { + MASS3DPA_ATOMIC_4; + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qx) { + MASS3DPA_ATOMIC_5; + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dx) { + MASS3DPA_ATOMIC_6; + } // lambda (dx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, 
mpa_at::D1D), + [&](Index_type dx) { + MASS3DPA_ATOMIC_7; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dx) { + MASS3DPA_ATOMIC_8; + MASS3DPA_ATOMIC_9(RAJAPERF_ATOMIC_ADD_RAJA_CUDA); + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + } // lambda (e) + ); // RAJA::loop + + } // outer lambda (ctx) + ); // RAJA::launch + //clang-format on + + } // loop over kernel reps + stopTimer(); + + break; + } + + default: { + + getCout() << "\n MASS3DPA_ATOMIC : Unknown Cuda variant id = " << vid << std::endl; + break; + } + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MASS3DPA_ATOMIC, Cuda, Base_CUDA, RAJA_CUDA) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/apps/MASS3DPA_ATOMIC-Hip.cpp b/src/apps/MASS3DPA_ATOMIC-Hip.cpp new file mode 100644 index 000000000..6d8ce9ea5 --- /dev/null +++ b/src/apps/MASS3DPA_ATOMIC-Hip.cpp @@ -0,0 +1,307 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-25, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// Uncomment to add compiler directives loop unrolling +//#define USE_RAJAPERF_UNROLL + +#include "MASS3DPA_ATOMIC.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf { +namespace apps { + +template < size_t block_size > + __launch_bounds__(block_size) +__global__ void Mass3DPA_Atomic(const Real_ptr B, + const Real_ptr D, const Real_ptr X, const Index_ptr ElemToDoF, Real_ptr Y) { + + const Index_type e = blockIdx.x; + + MASS3DPA_ATOMIC_0_GPU; + + + GPU_FOREACH_THREAD_DIRECT(dz, z, mpa_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, mpa_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(dx, x, mpa_at::D1D) { + MASS3DPA_ATOMIC_1; + } + } + } + + GPU_FOREACH_THREAD_DIRECT(dz, z, 1) { + GPU_FOREACH_THREAD_DIRECT(d, y, mpa_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(q, x, mpa_at::Q1D) { + MASS3DPA_ATOMIC_2; + } + } + } + + + GPU_FOREACH_THREAD_DIRECT(dz, z, mpa_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, mpa_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, mpa_at::Q1D) { + MASS3DPA_ATOMIC_3; + } + } + } + + + GPU_FOREACH_THREAD_DIRECT(dz, z, mpa_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(qy, y, mpa_at::Q1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, mpa_at::Q1D) { + MASS3DPA_ATOMIC_4; + } + } + } + + GPU_FOREACH_THREAD_DIRECT(qz, z, mpa_at::Q1D) { + GPU_FOREACH_THREAD_DIRECT(qy, y, mpa_at::Q1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, mpa_at::Q1D) { + MASS3DPA_ATOMIC_5; + } + } + } + + GPU_FOREACH_THREAD_DIRECT(qz, z, mpa_at::Q1D) { + GPU_FOREACH_THREAD_DIRECT(qy, y, mpa_at::Q1D) { + GPU_FOREACH_THREAD_DIRECT(dx, x, mpa_at::D1D) { + MASS3DPA_ATOMIC_6; + } + } + } + + GPU_FOREACH_THREAD_DIRECT(qz, z, mpa_at::Q1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, mpa_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(dx, x, mpa_at::D1D) { + MASS3DPA_ATOMIC_7; + } + } + } + + GPU_FOREACH_THREAD_DIRECT(dz, z, 
mpa_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, mpa_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(dx, x, mpa_at::D1D) { + MASS3DPA_ATOMIC_8; + MASS3DPA_ATOMIC_9(RAJAPERF_ATOMIC_ADD_HIP); + } + } + } + +} + +template < size_t block_size > +void MASS3DPA_ATOMIC::runHipVariantImpl(VariantID vid) { + setBlockSize(block_size); + + const Index_type run_reps = getRunReps(); + + auto res{getHipResource()}; + + MASS3DPA_ATOMIC_DATA_SETUP; + + switch (vid) { + + case Base_HIP: { + + startTimer(); + // Loop counter increment uses macro to quiet C++20 compiler warning + for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { + + dim3 nthreads_per_block(mpa_at::Q1D, mpa_at::Q1D, mpa_at::Q1D); + constexpr size_t shmem = 0; + + RPlaunchHipKernel( (Mass3DPA_Atomic), + NE, nthreads_per_block, + shmem, res.get_stream(), + B, D, X, ElemToDoF, Y ); + + } + stopTimer(); + + break; + } + + case RAJA_HIP: { + + constexpr bool async = true; + + using launch_policy = RAJA::LaunchPolicy>; + + using outer_x = RAJA::LoopPolicy; + + using inner_x = RAJA::LoopPolicy>; + + using inner_y = RAJA::LoopPolicy>; + + using inner_z = RAJA::LoopPolicy>; + + startTimer(); + // Loop counter increment uses macro to quiet C++20 compiler warning + for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { + + //clang-format off + RAJA::launch( res, + RAJA::LaunchParams(RAJA::Teams(NE), + RAJA::Threads(mpa_at::Q1D, mpa_at::Q1D, mpa_at::Q1D)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, RAJA::RangeSegment(0, NE), + [&](Index_type e) { + + + MASS3DPA_ATOMIC_0_GPU; + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dx) { + MASS3DPA_ATOMIC_1; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + + RAJA::loop(ctx, RAJA::RangeSegment(0, 
1), + [&](Index_type ) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type d) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type q) { + MASS3DPA_ATOMIC_2; + } // lambda (q) + ); // RAJA::loop + } // lambda (d) + ); // RAJA::loop + } // lambda () + ); // RAJA::loop + + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qx) { + MASS3DPA_ATOMIC_3; + } // lambda (qx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qx) { + MASS3DPA_ATOMIC_4; + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qx) { + MASS3DPA_ATOMIC_5; + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dx) { + MASS3DPA_ATOMIC_6; + } // lambda (dx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + 
[&](Index_type dx) { + MASS3DPA_ATOMIC_7; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dx) { + MASS3DPA_ATOMIC_8; + MASS3DPA_ATOMIC_9(RAJAPERF_ATOMIC_ADD_RAJA_HIP); + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + } // lambda (e) + ); // RAJA::loop + + } // outer lambda (ctx) + ); // RAJA::launch + //clang-format on + + } // loop over kernel reps + stopTimer(); + + break; + } + + default: { + + getCout() << "\n MASS3DPA_ATOMIC : Unknown Hip variant id = " << vid << std::endl; + break; + } + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MASS3DPA_ATOMIC, Hip, Base_HIP, RAJA_HIP) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/apps/MASS3DPA_ATOMIC-OMP.cpp b/src/apps/MASS3DPA_ATOMIC-OMP.cpp new file mode 100644 index 000000000..3e404a085 --- /dev/null +++ b/src/apps/MASS3DPA_ATOMIC-OMP.cpp @@ -0,0 +1,251 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-25, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// Uncomment to add compiler directives loop unrolling +//#define USE_RAJAPERF_UNROLL + +#include "MASS3DPA_ATOMIC.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf { +namespace apps { + + +void MASS3DPA_ATOMIC::runOpenMPVariant(VariantID vid) { + +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + MASS3DPA_ATOMIC_DATA_SETUP; + + switch (vid) { + + case Base_OpenMP: { + + startTimer(); + // Loop counter increment uses macro to quiet C++20 compiler warning + for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { + +#pragma omp parallel for + for (Index_type e = 0; e < NE; ++e) { + + MASS3DPA_ATOMIC_0_CPU; + + SHARED_LOOP_3D(dx, dy, dz, mpa_at::D1D, mpa_at::D1D, mpa_at::D1D) { + MASS3DPA_ATOMIC_1; + } + + SHARED_LOOP_2D(q, d, mpa_at::Q1D, mpa_at::D1D) { + MASS3DPA_ATOMIC_2; + } + + SHARED_LOOP_3D(qx, dy, dz, mpa_at::Q1D, mpa_at::D1D, mpa_at::D1D) { + MASS3DPA_ATOMIC_3; + } + + SHARED_LOOP_3D(qx, qy, dz, mpa_at::Q1D, mpa_at::Q1D, mpa_at::D1D) { + MASS3DPA_ATOMIC_4; + } + + SHARED_LOOP_3D(qx, qy, qz, mpa_at::Q1D, mpa_at::Q1D, mpa_at::Q1D) { + MASS3DPA_ATOMIC_5; + } + + SHARED_LOOP_3D(dx, qy, qz, mpa_at::D1D, mpa_at::Q1D, mpa_at::Q1D) { + MASS3DPA_ATOMIC_6; + } + + SHARED_LOOP_3D(dx, dy, qz, mpa_at::D1D, mpa_at::D1D, mpa_at::Q1D) { + MASS3DPA_ATOMIC_7; + } + + SHARED_LOOP_3D(dx, dy, dz, mpa_at::D1D, mpa_at::D1D, mpa_at::D1D) { + MASS3DPA_ATOMIC_8; + MASS3DPA_ATOMIC_9(RAJAPERF_ATOMIC_ADD_OMP); + } + + } // element loop + } + stopTimer(); + + break; + } + + case RAJA_OpenMP: { + + auto res{getHostResource()}; + + using launch_policy = RAJA::LaunchPolicy; + + using outer_x = RAJA::LoopPolicy; + + using inner_x = RAJA::LoopPolicy; + + using inner_y = RAJA::LoopPolicy; + + using inner_z = RAJA::LoopPolicy; + + startTimer(); + // Loop counter increment uses macro 
to quiet C++20 compiler warning + for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { + + //Grid is empty as the host does not need a compute grid to be specified + //clang-format off + RAJA::launch( res, + RAJA::LaunchParams(), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, NE), + [&](Index_type e) { + + MASS3DPA_ATOMIC_0_CPU; + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dx) { + MASS3DPA_ATOMIC_1; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, 1), + [&](Index_type ) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type d) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type q) { + MASS3DPA_ATOMIC_2; + } // lambda (q) + ); // RAJA::loop + } // lambda (d) + ); // RAJA::loop + } // lambda () + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qx) { + MASS3DPA_ATOMIC_3; + } // lambda (qx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qx) { + MASS3DPA_ATOMIC_4; + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type 
qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qx) { + MASS3DPA_ATOMIC_5; + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dx) { + MASS3DPA_ATOMIC_6; + } // lambda (dx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dx) { + MASS3DPA_ATOMIC_7; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dx) { + MASS3DPA_ATOMIC_8; + MASS3DPA_ATOMIC_9(RAJAPERF_ATOMIC_ADD_RAJA_OMP); + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + } // lambda (e) + ); // RAJA::loop + + } // outer lambda (ctx) + ); // RAJA::launch + //clang-format on + + } // loop over kernel reps + stopTimer(); + + return; + } + + default: + getCout() << "\n MASS3DPA_ATOMIC : Unknown OpenMP variant id = " << vid + << std::endl; + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +RAJAPERF_DEFAULT_TUNING_DEFINE_BOILERPLATE(MASS3DPA_ATOMIC, OpenMP, Base_OpenMP, RAJA_OpenMP) + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps/MASS3DPA_ATOMIC-Seq.cpp b/src/apps/MASS3DPA_ATOMIC-Seq.cpp new file mode 100644 index 000000000..9bbea94df --- /dev/null +++ 
b/src/apps/MASS3DPA_ATOMIC-Seq.cpp @@ -0,0 +1,251 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-25, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// Uncomment to add compiler directives for loop unrolling +//#define USE_RAJAPERF_UNROLL + +#include "MASS3DPA_ATOMIC.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf { +namespace apps { + + +void MASS3DPA_ATOMIC::runSeqVariant(VariantID vid) { + const Index_type run_reps = getRunReps(); + + MASS3DPA_ATOMIC_DATA_SETUP; + + switch (vid) { + + case Base_Seq: { + + startTimer(); + // Loop counter increment uses macro to quiet C++20 compiler warning + for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) + { + + for (Index_type e = 0; e < NE; ++e) { + + MASS3DPA_ATOMIC_0_CPU; + + SHARED_LOOP_3D(dx, dy, dz, mpa_at::D1D, mpa_at::D1D, mpa_at::D1D) { + MASS3DPA_ATOMIC_1; + } + + SHARED_LOOP_2D(q, d, mpa_at::Q1D, mpa_at::D1D) { + MASS3DPA_ATOMIC_2; + } + + SHARED_LOOP_3D(qx, dy, dz, mpa_at::Q1D, mpa_at::D1D, mpa_at::D1D) { + MASS3DPA_ATOMIC_3; + } + + SHARED_LOOP_3D(qx, qy, dz, mpa_at::Q1D, mpa_at::Q1D, mpa_at::D1D) { + MASS3DPA_ATOMIC_4; + } + + SHARED_LOOP_3D(qx, qy, qz, mpa_at::Q1D, mpa_at::Q1D, mpa_at::Q1D) { + MASS3DPA_ATOMIC_5; + } + + SHARED_LOOP_3D(dx, qy, qz, mpa_at::D1D, mpa_at::Q1D, mpa_at::Q1D) { + MASS3DPA_ATOMIC_6; + } + + SHARED_LOOP_3D(dx, dy, qz, mpa_at::D1D, mpa_at::D1D, mpa_at::Q1D) { + MASS3DPA_ATOMIC_7; + } + + SHARED_LOOP_3D(dx, dy, dz, mpa_at::D1D, mpa_at::D1D, mpa_at::D1D) { + MASS3DPA_ATOMIC_8; + MASS3DPA_ATOMIC_9(RAJAPERF_ATOMIC_ADD_SEQ); + } + + } // element loop + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case RAJA_Seq: { + + auto res{getHostResource()}; + + 
//Currently Teams requires two policies if compiled with a device + using launch_policy = RAJA::LaunchPolicy; + + using outer_x = RAJA::LoopPolicy; + + using inner_x = RAJA::LoopPolicy; + + using inner_y = RAJA::LoopPolicy; + + using inner_z = RAJA::LoopPolicy; + + startTimer(); + // Loop counter increment uses macro to quiet C++20 compiler warning + for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { + + //clang-format off + RAJA::launch( res, + RAJA::LaunchParams(), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, NE), + [&](Index_type e) { + + + MASS3DPA_ATOMIC_0_CPU; + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dx) { + MASS3DPA_ATOMIC_1; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + + RAJA::loop(ctx, RAJA::RangeSegment(0, 1), + [&](Index_type ) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type d) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type q) { + MASS3DPA_ATOMIC_2; + } // lambda (q) + ); // RAJA::loop + } // lambda (d) + ); // RAJA::loop + } // lambda () + ); // RAJA::loop + + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qx) { + MASS3DPA_ATOMIC_3; + } // lambda (qx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qx) { + MASS3DPA_ATOMIC_4; + } // lambda 
(qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qx) { + MASS3DPA_ATOMIC_5; + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dx) { + MASS3DPA_ATOMIC_6; + } // lambda (dx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dx) { + MASS3DPA_ATOMIC_7; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dx) { + MASS3DPA_ATOMIC_8; + MASS3DPA_ATOMIC_9(RAJAPERF_ATOMIC_ADD_RAJA_SEQ); + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + + } // lambda (e) + ); // RAJA::loop + + } // outer lambda (ctx) + ); // RAJA::launch + //clang-format on + + } // loop over kernel reps + stopTimer(); + + return; + } +#endif // RUN_RAJA_SEQ + + default: + getCout() << "\n MASS3DPA_ATOMIC : Unknown Seq variant id = " << vid << std::endl; + } +} + +RAJAPERF_DEFAULT_TUNING_DEFINE_BOILERPLATE(MASS3DPA_ATOMIC, Seq, 
Base_Seq, RAJA_Seq) + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps/MASS3DPA_ATOMIC-Sycl.cpp b/src/apps/MASS3DPA_ATOMIC-Sycl.cpp new file mode 100644 index 000000000..f71973157 --- /dev/null +++ b/src/apps/MASS3DPA_ATOMIC-Sycl.cpp @@ -0,0 +1,364 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-25, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// Uncomment to add compiler directives loop unrolling +//#define USE_RAJAPERF_UNROLL + +#include "MASS3DPA_ATOMIC.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf { +namespace apps { + +template < size_t work_group_size > +void MASS3DPA_ATOMIC::runSyclVariantImpl(VariantID vid) { + setBlockSize(work_group_size); + + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + MASS3DPA_ATOMIC_DATA_SETUP; + + const ::sycl::range<3> workGroupSize(mpa_at::Q1D, mpa_at::Q1D, mpa_at::Q1D); + const ::sycl::range<3> gridSize(mpa_at::Q1D, mpa_at::Q1D, mpa_at::Q1D*NE); + + switch (vid) { + + case Base_SYCL: { + + startTimer(); + // Loop counter increment uses macro to quiet C++20 compiler warning + for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { + + qu->submit([&](::sycl::handler& h) { + + constexpr Index_type MQ1 = mpa_at::Q1D; + constexpr Index_type MD1 = mpa_at::D1D; + constexpr Index_type MDQ = (MQ1 > MD1) ? 
MQ1 : MD1; + + auto smB_vec = ::sycl::local_accessor(::sycl::range<1>(MQ1*MD1), h); + auto smBt_vec = ::sycl::local_accessor(::sycl::range<1>(MQ1*MD1), h); + + auto sm0_vec = ::sycl::local_accessor(::sycl::range<1>(MDQ * MDQ * MDQ), h); + auto sm1_vec = ::sycl::local_accessor(::sycl::range<1>(MDQ * MDQ * MDQ), h); + + auto thread_dofs_vec = ::sycl::local_accessor(::sycl::range<1>(MDQ * MDQ * MDQ), h); + + h.parallel_for + (::sycl::nd_range<3>(gridSize, workGroupSize), + [=] (::sycl::nd_item<3> itm) { + + const Index_type e = itm.get_group(2); + + auto smB_ptr = smB_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + auto smBt_ptr = smBt_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + + Real_type (*sm_B)[MD1] = (Real_type (*)[MD1]) smB_ptr; + Real_type (*sm_Bt)[MQ1] = (Real_type (*)[MQ1]) smBt_ptr; + + auto sm0 = sm0_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + auto sm1 = sm1_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + + Real_type (*sm_X)[MD1][MD1] = (Real_type (*)[MD1][MD1])sm0; + Real_type (*DDQ)[MD1][MQ1] = (Real_type (*)[MD1][MQ1])sm1; + Real_type (*DQQ)[MQ1][MQ1] = (Real_type (*)[MQ1][MQ1])sm0; + Real_type (*QQQ)[MQ1][MQ1] = (Real_type (*)[MQ1][MQ1])sm1; + Real_type (*QQD)[MQ1][MD1] = (Real_type (*)[MQ1][MD1])sm0; + Real_type (*QDD)[MD1][MD1] = (Real_type (*)[MD1][MD1])sm1; + + auto thread_dofs = thread_dofs_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + + SYCL_FOREACH_THREAD_DIRECT(dz, 0, mpa_at::D1D) { + SYCL_FOREACH_THREAD_DIRECT(dy, 1, mpa_at::D1D) { + SYCL_FOREACH_THREAD_DIRECT(dx, 2, mpa_at::D1D) { + MASS3DPA_ATOMIC_1; + } + } + } + + SYCL_FOREACH_THREAD_DIRECT(dz, 0, 1) { + SYCL_FOREACH_THREAD_DIRECT(d, 1, mpa_at::D1D) { + SYCL_FOREACH_THREAD_DIRECT(q, 2, mpa_at::Q1D) { + MASS3DPA_ATOMIC_2; + } + } + } + + SYCL_FOREACH_THREAD_DIRECT(dz, 0, mpa_at::D1D) { + SYCL_FOREACH_THREAD_DIRECT(dy, 1, mpa_at::D1D) { + SYCL_FOREACH_THREAD_DIRECT(qx, 2, mpa_at::Q1D) { + MASS3DPA_ATOMIC_3; + } + } + } 
+ + SYCL_FOREACH_THREAD_DIRECT(dz, 0, mpa_at::D1D) { + SYCL_FOREACH_THREAD_DIRECT(qy, 1, mpa_at::Q1D) { + SYCL_FOREACH_THREAD_DIRECT(qx, 2, mpa_at::Q1D) { + MASS3DPA_ATOMIC_4; + } + } + } + + SYCL_FOREACH_THREAD_DIRECT(qz, 0, mpa_at::Q1D) { + SYCL_FOREACH_THREAD_DIRECT(qy, 1, mpa_at::Q1D) { + SYCL_FOREACH_THREAD_DIRECT(qx, 2, mpa_at::Q1D) { + MASS3DPA_ATOMIC_5; + } + } + } + + SYCL_FOREACH_THREAD_DIRECT(qz, 0, mpa_at::Q1D) { + SYCL_FOREACH_THREAD_DIRECT(qy, 1, mpa_at::Q1D) { + SYCL_FOREACH_THREAD_DIRECT(dx, 2, mpa_at::D1D) { + MASS3DPA_ATOMIC_6; + } + } + } + + SYCL_FOREACH_THREAD_DIRECT(qz, 0, mpa_at::Q1D) { + SYCL_FOREACH_THREAD_DIRECT(dy, 1, mpa_at::D1D) { + SYCL_FOREACH_THREAD_DIRECT(dx, 2, mpa_at::D1D) { + MASS3DPA_ATOMIC_7; + } + } + } + + SYCL_FOREACH_THREAD_DIRECT(dz, 0, mpa_at::D1D) { + SYCL_FOREACH_THREAD_DIRECT(dy, 2, mpa_at::D1D) { + SYCL_FOREACH_THREAD_DIRECT(dx, 1, mpa_at::D1D) { + MASS3DPA_ATOMIC_8; + MASS3DPA_ATOMIC_9(RAJAPERF_ATOMIC_ADD_SYCL); + } + } + } + + + }); + }); + + } + stopTimer(); + + break; + } + + case RAJA_SYCL: { + + constexpr bool async = true; + + using launch_policy = RAJA::LaunchPolicy>; + + using outer_x = RAJA::LoopPolicy; + + using inner_x = RAJA::LoopPolicy; + + using inner_y = RAJA::LoopPolicy; + + using inner_z = RAJA::LoopPolicy; + + //Calculate amount of shared memory needed + size_t shmem = 0; + { + constexpr Index_type MQ1 = mpa_at::Q1D; + constexpr Index_type MD1 = mpa_at::D1D; + constexpr Index_type MDQ = (MQ1 > MD1) ? 
MQ1 : MD1; + + constexpr Index_type no_mats = 2; + shmem += MQ1 * MD1 * no_mats * sizeof(Real_type) + //B,Bt + MDQ * MDQ * MDQ * no_mats * sizeof(Real_type) + //sm0,sm1 + MD1 * MD1 * MD1 * sizeof(Index_type); //thread_dofs + } + + startTimer(); + // Loop counter increment uses macro to quiet C++20 compiler warning + for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { + + //clang-format off + RAJA::launch( res, + RAJA::LaunchParams(RAJA::Teams(NE), + RAJA::Threads(mpa_at::Q1D, mpa_at::Q1D, mpa_at::Q1D), shmem), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, NE), + [&](Index_type e) { + + constexpr int MQ1 = mpa_at::Q1D; + constexpr int MD1 = mpa_at::D1D; + constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1; + + Real_ptr smB_ptr = ctx.getSharedMemory(MQ1*MD1); + Real_ptr smBt_ptr = ctx.getSharedMemory(MQ1*MD1); + + Real_type (*sm_B)[MD1] = (Real_type (*)[MD1]) smB_ptr; + Real_type (*sm_Bt)[MQ1] = (Real_type (*)[MQ1]) smBt_ptr; + + Real_ptr sm0 = ctx.getSharedMemory(MDQ * MDQ * MDQ); + Real_ptr sm1 = ctx.getSharedMemory(MDQ * MDQ * MDQ); + + Real_type (*sm_X)[MD1][MD1] = (Real_type (*)[MD1][MD1])sm0; + Real_type (*DDQ)[MD1][MQ1] = (Real_type (*)[MD1][MQ1])sm1; + Real_type (*DQQ)[MQ1][MQ1] = (Real_type (*)[MQ1][MQ1])sm0; + Real_type (*QQQ)[MQ1][MQ1] = (Real_type (*)[MQ1][MQ1])sm1; + Real_type (*QQD)[MQ1][MD1] = (Real_type (*)[MQ1][MD1])sm0; + Real_type (*QDD)[MD1][MD1] = (Real_type (*)[MD1][MD1])sm1; + + Index_ptr thread_dofs = ctx.getSharedMemory(MD1 * MD1 * MD1); + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dx) { + MASS3DPA_ATOMIC_1; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, 1), + [&](Index_type ) { + RAJA::loop(ctx, 
RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type d) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type q) { + MASS3DPA_ATOMIC_2; + } // lambda (q) + ); // RAJA::loop + } // lambda (d) + ); // RAJA::loop + } // lambda () + ); // RAJA::loop + + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qx) { + MASS3DPA_ATOMIC_3; + } // lambda (qx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qx) { + MASS3DPA_ATOMIC_4; + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qx) { + MASS3DPA_ATOMIC_5; + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dx) { + MASS3DPA_ATOMIC_6; + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dx) { + MASS3DPA_ATOMIC_7; + } // 
lambda (qx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa_at::D1D), + [&](Index_type dx) { + MASS3DPA_ATOMIC_8; + MASS3DPA_ATOMIC_9(RAJAPERF_ATOMIC_ADD_RAJA_SYCL); + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + + } // lambda (e) + ); // RAJA::loop + + } // outer lambda (ctx) + ); // RAJA::launch + //clang-format on + + } // loop over kernel reps + stopTimer(); + + break; + } + + default: { + + getCout() << "\n MASS3DPA_ATOMIC : Unknown Sycl variant id = " << vid << std::endl; + break; + } + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MASS3DPA_ATOMIC, Sycl, Base_SYCL, RAJA_SYCL) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/apps/MASS3DPA_ATOMIC.cpp b/src/apps/MASS3DPA_ATOMIC.cpp new file mode 100644 index 000000000..42e98128e --- /dev/null +++ b/src/apps/MASS3DPA_ATOMIC.cpp @@ -0,0 +1,119 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-25, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "MASS3DPA_ATOMIC.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#include "common/DataUtils.hpp"
+
+#include <algorithm>
+#include <cmath>
+
+namespace rajaperf {
+namespace apps {
+
+// Sizes the structured hex mesh from the target problem size and registers
+// the per-repetition work and memory-traffic metrics of the kernel.
+MASS3DPA_ATOMIC::MASS3DPA_ATOMIC(const RunParams &params)
+    : KernelBase(rajaperf::Apps_MASS3DPA_ATOMIC, params) {
+
+  // The element-count estimate below divides by (D1D - 1)^3.
+  static_assert(mpa_at::D1D > 1,
+                "MASS3DPA_ATOMIC requires polynomial order >= 1");
+
+  m_DOF_default = 1000000;
+  setDefaultProblemSize(m_DOF_default);
+  setDefaultReps(50);
+
+  // polynomial order
+  m_P = mpa_at::D1D - 1;
+
+  // approximate how many elements we need: target dofs / m_P^3, at least one
+  // (integer cube instead of pow(), which is a general floating-point routine)
+  const Index_type dofs_per_elem = m_P * m_P * m_P;
+  m_NE = std::max(
+      static_cast<Index_type>(getTargetProblemSize() / dofs_per_elem),
+      Index_type(1));
+
+  // Construct the mesh: the largest cube of elements that does not exceed the
+  // estimate. std::cbrt is not guaranteed to be exact for perfect cubes, so
+  // the truncated value is corrected with integer arithmetic.
+  m_Nx = static_cast<Index_type>(std::cbrt(m_NE));
+  while ((m_Nx + 1) * (m_Nx + 1) * (m_Nx + 1) <= m_NE) {
+    ++m_Nx;
+  }
+  while (m_Nx > 1 && m_Nx * m_Nx * m_Nx > m_NE) {
+    --m_Nx;
+  }
+  m_Ny = m_Nx;
+  m_Nz = m_Ny;
+  m_NE = m_Nx * m_Ny * m_Nz;
+
+  // compute true number of dofs
+  m_Tot_Dofs = (m_Nx * m_P + 1) * (m_Ny * m_P + 1) * (m_Nz * m_P + 1);
+
+  setActualProblemSize(m_Tot_Dofs);
+
+  setItsPerRep(m_NE * mpa_at::D1D * mpa_at::D1D);
+  setKernelsPerRep(1);
+
+  // NOTE(review): setUp() allocates only m_B and MASS3DPA_ATOMIC_2 derives
+  // sm_Bt from sm_B, yet the first term counts both B and Bt -- confirm.
+  setBytesReadPerRep(2 * sizeof(Real_type) * mpa_at::Q1D *
+                         mpa_at::D1D + // B, Bt
+                     1 * sizeof(Index_type) * mpa_at::D1D * mpa_at::D1D *
+                         mpa_at::D1D * m_NE + // ElemToDoF
+                     1 * sizeof(Real_type) * m_Tot_Dofs + // X
+                     1 * sizeof(Real_type) * mpa_at::Q1D * mpa_at::Q1D *
+                         mpa_at::Q1D * m_NE); // D
+
+  // NOTE(review): Y is only updated through the atomic add in
+  // MASS3DPA_ATOMIC_9, but its bytes are registered under both the
+  // modify-written and the atomic-modify-written counters -- confirm this is
+  // not double counting.
+  setBytesWrittenPerRep( 0 );
+  setBytesModifyWrittenPerRep( 1*sizeof(Real_type) * mpa_at::D1D*mpa_at::D1D*mpa_at::D1D*m_NE ); // Y
+
+  setBytesAtomicModifyWrittenPerRep(1*sizeof(Real_type) * mpa_at::D1D*mpa_at::D1D*mpa_at::D1D*m_NE ); // Y
+
+  // One term per contraction stage (MASS3DPA_ATOMIC_3 .. _8), per element.
+  setFLOPsPerRep(
+      m_NE *
+      (2 * mpa_at::D1D * mpa_at::D1D * mpa_at::D1D * mpa_at::Q1D +
+       2 * mpa_at::D1D * mpa_at::D1D * mpa_at::Q1D * mpa_at::Q1D +
+       2 * mpa_at::D1D * mpa_at::Q1D * mpa_at::Q1D * mpa_at::Q1D +
+       mpa_at::Q1D * mpa_at::Q1D * mpa_at::Q1D +
+       2 * mpa_at::Q1D * mpa_at::Q1D * mpa_at::Q1D * mpa_at::D1D +
+       2 * mpa_at::Q1D * mpa_at::Q1D * mpa_at::D1D * mpa_at::D1D +
+       2 * mpa_at::Q1D * mpa_at::D1D * mpa_at::D1D * mpa_at::D1D +
+       mpa_at::D1D * mpa_at::D1D * mpa_at::D1D));
+
+  setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning);
+  setChecksumTolerance(ChecksumTolerance::normal);
+
+  setComplexity(Complexity::N);
+
+  setUsesFeature(Launch);
+
+  addVariantTunings();
+}
+
+MASS3DPA_ATOMIC::~MASS3DPA_ATOMIC() {}
+
+// Allocates the kernel data for variant vid: B, D and X are filled with 1.0,
+// Y with 0.0, and the element-to-dof table is built for the
+// m_Nx x m_Ny x m_Nz mesh.
+void MASS3DPA_ATOMIC::setUp(VariantID vid,
+                            size_t RAJAPERF_UNUSED_ARG(tune_idx)) {
+
+  allocAndInitDataConst(m_B, Index_type(mpa_at::Q1D * mpa_at::D1D),
+                        Real_type(1.0), vid);
+  allocAndInitDataConst(
+      m_D, Index_type(mpa_at::Q1D * mpa_at::Q1D * mpa_at::Q1D * m_NE),
+      Real_type(1.0), vid);
+  allocAndInitDataConst(m_X, Index_type(m_Tot_Dofs), Real_type(1.0), vid);
+  allocAndInitDataConst(m_Y, Index_type(m_Tot_Dofs), Real_type(0.0), vid);
+
+  // Element-to-dof table: (m_P + 1)^3 entries per element. The size is kept
+  // in the same type as m_NE instead of being narrowed to int.
+  const Index_type ndof_per_elem = (m_P + 1) * (m_P + 1) * (m_P + 1);
+  const Index_type total_size = ndof_per_elem * m_NE;
+
+  // NOTE(review): the returned handle is kept alive while the table is
+  // filled; presumably it keeps the buffer host-accessible until it goes out
+  // of scope -- confirm against common/DataUtils.hpp.
+  auto a_elemToDoF = allocDataForInit(m_ElemToDoF, total_size, vid);
+  buildElemToDofTable(m_Nx, m_Ny, m_Nz, m_P, m_ElemToDoF);
+}
+
+// Folds the m_Tot_Dofs entries of the result vector Y into the checksum.
+void MASS3DPA_ATOMIC::updateChecksum(VariantID vid,
+                                     size_t RAJAPERF_UNUSED_ARG(tune_idx)) {
+  addToChecksum(m_Y, m_Tot_Dofs, vid);
+}
+
+// Releases every buffer allocated in setUp().
+void MASS3DPA_ATOMIC::tearDown(VariantID vid,
+                               size_t RAJAPERF_UNUSED_ARG(tune_idx)) {
+  deallocData(m_B, vid);
+  deallocData(m_D, vid);
+  deallocData(m_X, vid);
+  deallocData(m_Y, vid);
+  deallocData(m_ElemToDoF, vid);
+}
+
+} // end namespace apps
+} // end namespace rajaperf
diff --git a/src/apps/MASS3DPA_ATOMIC.hpp b/src/apps/MASS3DPA_ATOMIC.hpp
new file mode 100644
index 000000000..5b0a71814
--- /dev/null
+++ b/src/apps/MASS3DPA_ATOMIC.hpp
@@ -0,0 +1,370 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-25, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Action of 3D mass matrix via partial assembly +/// +/// +/// for (Index_type e = 0; e < NE; ++e) { +/// +/// constexpr Index_type MQ1 = mpa_at::Q1D; +/// constexpr Index_type MD1 = mpa_at::D1D; +/// +/// constexpr Index_type MDQ = (MQ1 > MD1) ? MQ1 : MD1; +/// +/// Real_type sm_B[MQ1][MD1]; +/// Real_type sm_Bt[MD1][MQ1]; +/// +/// Real_type sm0[MDQ * MDQ * MDQ]; +/// Real_type sm1[MDQ * MDQ * MDQ]; +/// Real_type(*sm_X)[MD1][MD1] = (Real_type(*)[MD1][MD1])sm0; +/// Real_type(*DDQ)[MD1][MQ1] = (Real_type(*)[MD1][MQ1])sm1; +/// Real_type(*DQQ)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])sm0; +/// Real_type(*QQQ)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])sm1; +/// Real_type(*QQD)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])sm0; +/// Real_type(*QDD)[MD1][MD1] = (Real_type(*)[MD1][MD1])sm1; +/// +/// Index_type thread_dofs[MD1 * MD1 * MD1]; +/// +/// for(Index_type dz=0; dz MD1) ? MQ1 : MD1; \ + Real_type sm_B[MQ1][MD1]; \ + Real_type sm_Bt[MD1][MQ1]; \ + Real_type sm0[MDQ * MDQ * MDQ]; \ + Real_type sm1[MDQ * MDQ * MDQ]; \ + Real_type(*sm_X)[MD1][MD1] = (Real_type(*)[MD1][MD1])sm0; \ + Real_type(*DDQ)[MD1][MQ1] = (Real_type(*)[MD1][MQ1])sm1; \ + Real_type(*DQQ)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])sm0; \ + Real_type(*QQQ)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])sm1; \ + Real_type(*QQD)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])sm0; \ + Real_type(*QDD)[MD1][MD1] = (Real_type(*)[MD1][MD1])sm1; \ + Index_type thread_dofs[MD1 * MD1 * MD1]; + +#define MASS3DPA_ATOMIC_0_GPU \ + constexpr Index_type MQ1 = mpa_at::Q1D; \ + constexpr Index_type MD1 = mpa_at::D1D; \ + constexpr Index_type MDQ = (MQ1 > MD1) ? 
MQ1 : MD1; \ + RAJA_TEAM_SHARED Real_type sm_B[MQ1][MD1]; \ + RAJA_TEAM_SHARED Real_type sm_Bt[MD1][MQ1]; \ + RAJA_TEAM_SHARED Real_type sm0[MDQ * MDQ * MDQ]; \ + RAJA_TEAM_SHARED Real_type sm1[MDQ * MDQ * MDQ]; \ + Real_type(*sm_X)[MD1][MD1] = (Real_type(*)[MD1][MD1])sm0; \ + Real_type(*DDQ)[MD1][MQ1] = (Real_type(*)[MD1][MQ1])sm1; \ + Real_type(*DQQ)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])sm0; \ + Real_type(*QQQ)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])sm1; \ + Real_type(*QQD)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])sm0; \ + Real_type(*QDD)[MD1][MD1] = (Real_type(*)[MD1][MD1])sm1; \ + RAJA_TEAM_SHARED Index_type thread_dofs[MD1 * MD1 * MD1]; + +#define MASS3DPA_ATOMIC_1 \ + Index_type j = dx + mpa_at::D1D * (dy + dz * mpa_at::D1D); \ + thread_dofs[j] = ElemToDoF[j + mpa_at::D1D * mpa_at::D1D * mpa_at::D1D * e]; \ + sm_X[dz][dy][dx] = \ + X[thread_dofs[j]]; // missing dof_map for lexicographical ordering + +#define MASS3DPA_ATOMIC_2 \ + sm_B[q][d] = MPAT_B(q, d); \ + sm_Bt[d][q] = sm_B[q][d]; + +// flop counts +// 2 * D1D +#define MASS3DPA_ATOMIC_3 \ + Real_type u = 0.0; \ + for (Index_type dx = 0; dx < mpa_at::D1D; ++dx) { \ + u += sm_X[dz][dy][dx] * sm_B[qx][dx]; \ + } \ + DDQ[dz][dy][qx] = u; + +// 2 * D1D +#define MASS3DPA_ATOMIC_4 \ + Real_type u = 0.0; \ + for (Index_type dy = 0; dy < mpa_at::D1D; ++dy) { \ + u += DDQ[dz][dy][qx] * sm_B[qy][dy]; \ + } \ + DQQ[dz][qy][qx] = u; + +// 2 * D1D + 1 +#define MASS3DPA_ATOMIC_5 \ + Real_type u = 0.0; \ + for (Index_type dz = 0; dz < mpa_at::D1D; ++dz) { \ + u += DQQ[dz][qy][qx] * sm_B[qz][dz]; \ + } \ + QQQ[qz][qy][qx] = u * MPAT_D(qx, qy, qz, e); + +// 2 * Q1D +#define MASS3DPA_ATOMIC_6 \ + Real_type u = 0.0; \ + for (Index_type qx = 0; qx < mpa_at::Q1D; ++qx) { \ + u += QQQ[qz][qy][qx] * sm_Bt[dx][qx]; \ + } \ + QQD[qz][qy][dx] = u; + +// 2 * Q1D +#define MASS3DPA_ATOMIC_7 \ + Real_type u = 0.0; \ + for (Index_type qy = 0; qy < mpa_at::Q1D; ++qy) { \ + u += QQD[qz][qy][dx] * sm_Bt[dy][qy]; \ + } \ + QDD[qz][dy][dx] = u; + 
+// 2 * Q1D + 1
+// Contracts QDD over qz against sm_Bt for one (dz, dy, dx) and computes the
+// element-local dof index j. Leaves u and j in scope for MASS3DPA_ATOMIC_9.
+#define MASS3DPA_ATOMIC_8                                                      \
+  Real_type u = 0.0;                                                           \
+  for (Index_type qz = 0; qz < mpa_at::Q1D; ++qz) {                            \
+    u += QDD[qz][dy][dx] * sm_Bt[dz][qz];                                      \
+  }                                                                            \
+  const Index_type j = dx + mpa_at::D1D * (dy + dz * mpa_at::D1D);
+
+// Accumulates u into Y at the global dof thread_dofs[j] through the supplied
+// atomicAdd operation. Uses the u and j declared by MASS3DPA_ATOMIC_8, so it
+// must follow that macro in the same scope.
+#define MASS3DPA_ATOMIC_9(atomicAdd)                                           \
+  atomicAdd(Y[thread_dofs[j]], u); // atomic add
+
+namespace rajaperf {
+class RunParams;
+
+namespace apps {
+
+/**
+ * Build element-to-DOF connectivity for a structured 3D hex mesh
+ * with arbitrary polynomial order p and 1 DOF per node.
+ *
+ * Inputs:
+ *   Nx, Ny, Nz : number of elements in x, y, z directions
+ *   p          : polynomial order (>=1)
+ *
+ * Outputs:
+ *   elemToDof : flat, caller-allocated buffer of Nx*Ny*Nz*(p+1)^3 entries;
+ *               element e owns entries [e*(p+1)^3, (e+1)*(p+1)^3), which
+ *               receive the global DOF indices of that element, with the
+ *               local x index running fastest, then y, then z
+ *
+ * Element numbering:
+ *   elem_id = ex + Nx * (ey + Ny * ez)
+ *
+ * Node (= DOF) numbering:
+ *   node_id = ix + (Nx*p + 1) * (iy + (Ny*p + 1) * iz)
+ *   with ix = ex*p + kx (likewise iy, iz), so neighbouring elements share
+ *   the nodes on their common faces.
+ */
+inline void
+buildElemToDofTable(Index_type Nx, Index_type Ny, Index_type Nz, Index_type p,
+                    Index_ptr elemToDof) // output buffer, must be preallocated
+{
+  // Nodes per mesh direction; only the x and y counts enter the linear index.
+  const Index_type num_nodes_x = Nx * p + 1;
+  const Index_type num_nodes_y = Ny * p + 1;
+
+  const Index_type ndof_per_elem = (p + 1) * (p + 1) * (p + 1);
+
+  // Loop over elements
+  for (Index_type ez = 0; ez < Nz; ++ez) {
+    for (Index_type ey = 0; ey < Ny; ++ey) {
+      for (Index_type ex = 0; ex < Nx; ++ex) {
+        // Global element index (row in elemToDof)
+        Index_type e = ex + Nx * (ey + Ny * ez);
+
+        // Pointer to start of this element's DOF list
+        Index_ptr row = elemToDof + e * ndof_per_elem;
+
+        // Next free slot in this element's row
+        Index_type local = 0;
+
+        // Loop over local nodes of the element
+        for (Index_type kz = 0; kz <= p; ++kz) {
+          Index_type iz = ez * p + kz;
+          for (Index_type ky = 0; ky <= p; ++ky) {
+            Index_type iy = ey * p + ky;
+            for (Index_type kx = 0; kx <= p; ++kx) {
+              Index_type ix = ex * p + kx;
+
+              Index_type nodeID = ix + num_nodes_x * (iy + num_nodes_y * iz);
+
+              // Scalar DOF per node, so dofID == nodeID
+              Index_type dofID = nodeID;
+
+              row[local++] = dofID;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+class MASS3DPA_ATOMIC : public KernelBase { +public: + MASS3DPA_ATOMIC(const RunParams ¶ms); + + ~MASS3DPA_ATOMIC(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void defineSeqVariantTunings(); + void defineOpenMPVariantTunings(); + void defineCudaVariantTunings(); + void defineHipVariantTunings(); + void defineSyclVariantTunings(); + + void runSeqVariant(VariantID vid); + void runOpenMPVariant(VariantID vid); + + template void runCudaVariantImpl(VariantID vid); + template void runHipVariantImpl(VariantID vid); + template void runSyclVariantImpl(VariantID vid); + +private: + static const size_t default_gpu_block_size = + mpa_at::Q1D * mpa_at::Q1D * mpa_at::Q1D; + using gpu_block_sizes_type = integer::list_type; + + Real_ptr m_B; + Real_ptr m_Bt; + Real_ptr m_D; + Real_ptr m_X; + Real_ptr m_Y; + + Index_type m_Nx; // zones in x dimension + Index_type m_Ny; // zones in y dimension + Index_type m_Nz; // zones in z dimension + Index_type m_P; // polynomial order + Index_type m_Tot_Dofs; // total number of dofs + + Index_ptr m_ElemToDoF; + + Index_type m_NE; + Index_type m_DOF_default; +}; + +} // end namespace apps +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/apps/MASSVEC3DPA-Cuda.cpp b/src/apps/MASSVEC3DPA-Cuda.cpp index f249bc939..d39cf7504 100644 --- a/src/apps/MASSVEC3DPA-Cuda.cpp +++ b/src/apps/MASSVEC3DPA-Cuda.cpp @@ -21,7 +21,7 @@ namespace apps { template __launch_bounds__(block_size) __global__ -void MassVec3DPA_BLOCKDIM_LOOP_INC(const Real_ptr B, const Real_ptr Bt, +void MassVec3DPA_BLOCKDIM_LOOP_INC(const Real_ptr B, const Real_ptr D, const Real_ptr X, Real_ptr Y) { @@ -30,42 +30,42 @@ void MassVec3DPA_BLOCKDIM_LOOP_INC(const Real_ptr B, const Real_ptr Bt, MASSVEC3DPA_0_GPU; - GPU_SHARED_LOOP_2D(q, d, MVPA_Q1D, MVPA_D1D) { + GPU_SHARED_LOOP_2D(q, d, mvpa::Q1D, mvpa::D1D) { MASSVEC3DPA_1; } 
for (Index_type c = 0; c < 3; ++c) { - GPU_SHARED_LOOP_3D(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D) { + GPU_SHARED_LOOP_3D(dx, dy, dz, mvpa::D1D, mvpa::D1D, mvpa::D1D) { MASSVEC3DPA_2; } __syncthreads(); - GPU_SHARED_LOOP_3D(qx, dy, dz, MVPA_Q1D, MVPA_D1D, MVPA_D1D) { + GPU_SHARED_LOOP_3D(qx, dy, dz, mvpa::Q1D, mvpa::D1D, mvpa::D1D) { MASSVEC3DPA_3; } __syncthreads(); - GPU_SHARED_LOOP_3D(qx, qy, dz, MVPA_Q1D, MVPA_Q1D, MVPA_D1D) { + GPU_SHARED_LOOP_3D(qx, qy, dz, mvpa::Q1D, mvpa::Q1D, mvpa::D1D) { MASSVEC3DPA_4; } __syncthreads(); - GPU_SHARED_LOOP_3D(qx, qy, qz, MVPA_Q1D, MVPA_Q1D, MVPA_Q1D) { + GPU_SHARED_LOOP_3D(qx, qy, qz, mvpa::Q1D, mvpa::Q1D, mvpa::Q1D) { MASSVEC3DPA_5; } __syncthreads(); - GPU_SHARED_LOOP_3D(dx, qy, qz, MVPA_D1D, MVPA_Q1D, MVPA_Q1D) { + GPU_SHARED_LOOP_3D(dx, qy, qz, mvpa::D1D, mvpa::Q1D, mvpa::Q1D) { MASSVEC3DPA_6; } __syncthreads(); - GPU_SHARED_LOOP_3D(dx, dy, qz, MVPA_D1D, MVPA_D1D, MVPA_Q1D) { + GPU_SHARED_LOOP_3D(dx, dy, qz, mvpa::D1D, mvpa::D1D, mvpa::Q1D) { MASSVEC3DPA_7; } __syncthreads(); - GPU_SHARED_LOOP_3D(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D) { + GPU_SHARED_LOOP_3D(dx, dy, dz, mvpa::D1D, mvpa::D1D, mvpa::D1D) { MASSVEC3DPA_8; } __syncthreads(); @@ -75,7 +75,7 @@ void MassVec3DPA_BLOCKDIM_LOOP_INC(const Real_ptr B, const Real_ptr Bt, template __launch_bounds__(block_size) __global__ -void MassVec3DPA_ARGUMENT_LOOP_INC(const Real_ptr B, const Real_ptr Bt, +void MassVec3DPA_ARGUMENT_LOOP_INC(const Real_ptr B, const Real_ptr D, const Real_ptr X, Real_ptr Y, const Index_type runtime_block_size) @@ -85,48 +85,48 @@ void MassVec3DPA_ARGUMENT_LOOP_INC(const Real_ptr B, const Real_ptr Bt, MASSVEC3DPA_0_GPU; - GPU_SHARED_LOOP_2D_INC(q, d, MVPA_Q1D, MVPA_D1D, runtime_block_size) { + GPU_SHARED_LOOP_2D_INC(q, d, mvpa::Q1D, mvpa::D1D, runtime_block_size) { MASSVEC3DPA_1; } for (Index_type c = 0; c < 3; ++c) { - GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, + GPU_SHARED_LOOP_3D_INC(dx, dy, dz, mvpa::D1D, mvpa::D1D, 
mvpa::D1D, runtime_block_size) { MASSVEC3DPA_2; } __syncthreads(); - GPU_SHARED_LOOP_3D_INC(qx, dy, dz, MVPA_Q1D, MVPA_D1D, MVPA_D1D, + GPU_SHARED_LOOP_3D_INC(qx, dy, dz, mvpa::Q1D, mvpa::D1D, mvpa::D1D, runtime_block_size) { MASSVEC3DPA_3; } __syncthreads(); - GPU_SHARED_LOOP_3D_INC(qx, qy, dz, MVPA_Q1D, MVPA_Q1D, MVPA_D1D, + GPU_SHARED_LOOP_3D_INC(qx, qy, dz, mvpa::Q1D, mvpa::Q1D, mvpa::D1D, runtime_block_size) { MASSVEC3DPA_4; } __syncthreads(); - GPU_SHARED_LOOP_3D_INC(qx, qy, qz, MVPA_Q1D, MVPA_Q1D, MVPA_Q1D, + GPU_SHARED_LOOP_3D_INC(qx, qy, qz, mvpa::Q1D, mvpa::Q1D, mvpa::Q1D, runtime_block_size) { MASSVEC3DPA_5; } __syncthreads(); - GPU_SHARED_LOOP_3D_INC(dx, qy, qz, MVPA_D1D, MVPA_Q1D, MVPA_Q1D, + GPU_SHARED_LOOP_3D_INC(dx, qy, qz, mvpa::D1D, mvpa::Q1D, mvpa::Q1D, runtime_block_size) { MASSVEC3DPA_6; } __syncthreads(); - GPU_SHARED_LOOP_3D_INC(dx, dy, qz, MVPA_D1D, MVPA_D1D, MVPA_Q1D, + GPU_SHARED_LOOP_3D_INC(dx, dy, qz, mvpa::D1D, mvpa::D1D, mvpa::Q1D, runtime_block_size) { MASSVEC3DPA_7; } __syncthreads(); - GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, + GPU_SHARED_LOOP_3D_INC(dx, dy, dz, mvpa::D1D, mvpa::D1D, mvpa::D1D, runtime_block_size) { MASSVEC3DPA_8; } @@ -137,7 +137,7 @@ void MassVec3DPA_ARGUMENT_LOOP_INC(const Real_ptr B, const Real_ptr Bt, template __launch_bounds__(block_size) __global__ -void MassVec3DPA_COMPILE_LOOP_INC(const Real_ptr B, const Real_ptr Bt, +void MassVec3DPA_COMPILE_LOOP_INC(const Real_ptr B, const Real_ptr D, const Real_ptr X, Real_ptr Y) { @@ -146,48 +146,48 @@ void MassVec3DPA_COMPILE_LOOP_INC(const Real_ptr B, const Real_ptr Bt, MASSVEC3DPA_0_GPU; - GPU_SHARED_LOOP_2D_INC(q, d, MVPA_Q1D, MVPA_D1D, block_size) { + GPU_SHARED_LOOP_2D_INC(q, d, mvpa::Q1D, mvpa::D1D, block_size) { MASSVEC3DPA_1; } for (Index_type c = 0; c < 3; ++c) { - GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, + GPU_SHARED_LOOP_3D_INC(dx, dy, dz, mvpa::D1D, mvpa::D1D, mvpa::D1D, block_size) { MASSVEC3DPA_2; } 
__syncthreads(); - GPU_SHARED_LOOP_3D_INC(qx, dy, dz, MVPA_Q1D, MVPA_D1D, MVPA_D1D, + GPU_SHARED_LOOP_3D_INC(qx, dy, dz, mvpa::Q1D, mvpa::D1D, mvpa::D1D, block_size) { MASSVEC3DPA_3; } __syncthreads(); - GPU_SHARED_LOOP_3D_INC(qx, qy, dz, MVPA_Q1D, MVPA_Q1D, MVPA_D1D, + GPU_SHARED_LOOP_3D_INC(qx, qy, dz, mvpa::Q1D, mvpa::Q1D, mvpa::D1D, block_size) { MASSVEC3DPA_4; } __syncthreads(); - GPU_SHARED_LOOP_3D_INC(qx, qy, qz, MVPA_Q1D, MVPA_Q1D, MVPA_Q1D, + GPU_SHARED_LOOP_3D_INC(qx, qy, qz, mvpa::Q1D, mvpa::Q1D, mvpa::Q1D, block_size) { MASSVEC3DPA_5; } __syncthreads(); - GPU_SHARED_LOOP_3D_INC(dx, qy, qz, MVPA_D1D, MVPA_Q1D, MVPA_Q1D, + GPU_SHARED_LOOP_3D_INC(dx, qy, qz, mvpa::D1D, mvpa::Q1D, mvpa::Q1D, block_size) { MASSVEC3DPA_6; } __syncthreads(); - GPU_SHARED_LOOP_3D_INC(dx, dy, qz, MVPA_D1D, MVPA_D1D, MVPA_Q1D, + GPU_SHARED_LOOP_3D_INC(dx, dy, qz, mvpa::D1D, mvpa::D1D, mvpa::Q1D, block_size) { MASSVEC3DPA_7; } __syncthreads(); - GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, + GPU_SHARED_LOOP_3D_INC(dx, dy, dz, mvpa::D1D, mvpa::D1D, mvpa::D1D, block_size) { MASSVEC3DPA_8; } @@ -198,7 +198,7 @@ void MassVec3DPA_COMPILE_LOOP_INC(const Real_ptr B, const Real_ptr Bt, template __launch_bounds__(block_size) __global__ -void MassVec3DPA_DIRECT(const Real_ptr B, const Real_ptr Bt, +void MassVec3DPA_DIRECT(const Real_ptr B, const Real_ptr D, const Real_ptr X, Real_ptr Y) { @@ -207,42 +207,42 @@ void MassVec3DPA_DIRECT(const Real_ptr B, const Real_ptr Bt, MASSVEC3DPA_0_GPU; - GPU_SHARED_DIRECT_2D(q, d, MVPA_Q1D, MVPA_D1D) { + GPU_SHARED_DIRECT_2D(q, d, mvpa::Q1D, mvpa::D1D) { MASSVEC3DPA_1; } for (Index_type c = 0; c < 3; ++c) { - GPU_SHARED_DIRECT_3D(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D) { + GPU_SHARED_DIRECT_3D(dx, dy, dz, mvpa::D1D, mvpa::D1D, mvpa::D1D) { MASSVEC3DPA_2; } __syncthreads(); - GPU_SHARED_DIRECT_3D(qx, dy, dz, MVPA_Q1D, MVPA_D1D, MVPA_D1D) { + GPU_SHARED_DIRECT_3D(qx, dy, dz, mvpa::Q1D, mvpa::D1D, mvpa::D1D) { MASSVEC3DPA_3; } 
__syncthreads(); - GPU_SHARED_DIRECT_3D(qx, qy, dz, MVPA_Q1D, MVPA_Q1D, MVPA_D1D) { + GPU_SHARED_DIRECT_3D(qx, qy, dz, mvpa::Q1D, mvpa::Q1D, mvpa::D1D) { MASSVEC3DPA_4; } __syncthreads(); - GPU_SHARED_DIRECT_3D(qx, qy, qz, MVPA_Q1D, MVPA_Q1D, MVPA_Q1D) { + GPU_SHARED_DIRECT_3D(qx, qy, qz, mvpa::Q1D, mvpa::Q1D, mvpa::Q1D) { MASSVEC3DPA_5; } __syncthreads(); - GPU_SHARED_DIRECT_3D(dx, qy, qz, MVPA_D1D, MVPA_Q1D, MVPA_Q1D) { + GPU_SHARED_DIRECT_3D(dx, qy, qz, mvpa::D1D, mvpa::Q1D, mvpa::Q1D) { MASSVEC3DPA_6; } __syncthreads(); - GPU_SHARED_DIRECT_3D(dx, dy, qz, MVPA_D1D, MVPA_D1D, MVPA_Q1D) { + GPU_SHARED_DIRECT_3D(dx, dy, qz, mvpa::D1D, mvpa::D1D, mvpa::Q1D) { MASSVEC3DPA_7; } __syncthreads(); - GPU_SHARED_DIRECT_3D(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D) { + GPU_SHARED_DIRECT_3D(dx, dy, dz, mvpa::D1D, mvpa::D1D, mvpa::D1D) { MASSVEC3DPA_8; } __syncthreads(); @@ -250,6 +250,169 @@ void MassVec3DPA_DIRECT(const Real_ptr B, const Real_ptr Bt, } // (c) dimension loop } +template +void MASSVEC3DPA::runRAJAImpl(RESOURCE &res) +{ + + MASSVEC3DPA_DATA_SETUP; + + constexpr bool async = true; + + using launch_policy = RAJA::LaunchPolicy< + RAJA::cuda_launch_t>; + + using outer_x = RAJA::LoopPolicy; + + //clang-format off + RAJA::launch( + res, + RAJA::LaunchParams(RAJA::Teams(NE), + RAJA::Threads(mvpa::Q1D, mvpa::Q1D, mvpa::Q1D)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, NE), + [&](Index_type e) { + + MASSVEC3DPA_0_GPU + + RAJA::loop(ctx, RAJA::RangeSegment(0, 1), + [&](Index_type) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), + [&](Index_type d) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), + [&](Index_type q) { + MASSVEC3DPA_1; + } // lambda (q) + ); // RAJA::loop + } // lambda (d) + ); // RAJA::loop + } // lambda () + ); // RAJA::loop + + for (Index_type c = 0; c < 3; ++c) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), + 
[&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), + [&](Index_type dx) { + MASSVEC3DPA_2; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), + [&](Index_type qx) { + MASSVEC3DPA_3; + } // lambda (qx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), + [&](Index_type qx) { + MASSVEC3DPA_4; + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), + [&](Index_type qx) { + MASSVEC3DPA_5; + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), + [&](Index_type dx) { + MASSVEC3DPA_6; + } // lambda (dx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), + [&](Index_type dx) { + 
MASSVEC3DPA_7; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), + [&](Index_type dx) { + MASSVEC3DPA_8; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + ctx.teamSync(); + + } // c - dim loop + } // lambda (e) + ); // RAJA::loop + } // outer lambda (ctx) + ); // RAJA::launch + //clang-format on + +} + template void MASSVEC3DPA::runCudaVariantImpl(VariantID vid) { @@ -272,11 +435,11 @@ void MASSVEC3DPA::runCudaVariantImpl(VariantID vid) // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { - dim3 nthreads_per_block(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D); + dim3 nthreads_per_block(mvpa::Q1D, mvpa::Q1D, mvpa::Q1D); constexpr size_t shmem = 0; RPlaunchCudaKernel((MassVec3DPA_BLOCKDIM_LOOP_INC), NE, - nthreads_per_block, shmem, res.get_stream(), B, Bt, D, + nthreads_per_block, shmem, res.get_stream(), B, D, X, Y); } stopTimer(); @@ -288,12 +451,12 @@ void MASSVEC3DPA::runCudaVariantImpl(VariantID vid) // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { - dim3 nthreads_per_block(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D); + dim3 nthreads_per_block(mvpa::Q1D, mvpa::Q1D, mvpa::Q1D); constexpr size_t shmem = 0; RPlaunchCudaKernel((MassVec3DPA_ARGUMENT_LOOP_INC), NE, - nthreads_per_block, shmem, res.get_stream(), B, Bt, D, - X, Y, static_cast(MVPA_Q1D)); + nthreads_per_block, shmem, res.get_stream(), B, D, + X, Y, static_cast(mvpa::Q1D)); } stopTimer(); @@ -303,11 +466,11 @@ void MASSVEC3DPA::runCudaVariantImpl(VariantID vid) // Loop counter increment uses macro to quiet C++20 
compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { - dim3 nthreads_per_block(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D); + dim3 nthreads_per_block(mvpa::Q1D, mvpa::Q1D, mvpa::Q1D); constexpr size_t shmem = 0; RPlaunchCudaKernel((MassVec3DPA_COMPILE_LOOP_INC), NE, - nthreads_per_block, shmem, res.get_stream(), B, Bt, D, + nthreads_per_block, shmem, res.get_stream(), B, D, X, Y); } stopTimer(); @@ -318,11 +481,11 @@ void MASSVEC3DPA::runCudaVariantImpl(VariantID vid) // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { - dim3 nthreads_per_block(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D); + dim3 nthreads_per_block(mvpa::Q1D, mvpa::Q1D, mvpa::Q1D); constexpr size_t shmem = 0; RPlaunchCudaKernel((MassVec3DPA_DIRECT), NE, - nthreads_per_block, shmem, res.get_stream(), B, Bt, D, + nthreads_per_block, shmem, res.get_stream(), B, D, X, Y); } stopTimer(); @@ -333,10 +496,6 @@ void MASSVEC3DPA::runCudaVariantImpl(VariantID vid) case RAJA_CUDA: { - constexpr bool async = true; - - using launch_policy = RAJA::LaunchPolicy< - RAJA::cuda_launch_t>; using outer_x = RAJA::LoopPolicy; @@ -352,150 +511,7 @@ void MASSVEC3DPA::runCudaVariantImpl(VariantID vid) // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { - RAJA::launch( - res, - RAJA::LaunchParams(RAJA::Teams(NE), - RAJA::Threads(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, NE), - [&](Index_type e) { - - MASSVEC3DPA_0_GPU - - RAJA::loop(ctx, RAJA::RangeSegment(0, 1), - [&](Index_type) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type q) { - MASSVEC3DPA_1; - } // lambda (q) - ); // RAJA::loop - } // lambda (d) - ); // RAJA::loop - } // lambda () - ); // RAJA::loop - - for 
(Index_type c = 0; c < 3; ++c) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_2; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_3; - } // lambda (qx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_4; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_5; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_6; - } // lambda (dx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - 
RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_7; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_8; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - } // c - dim loop - } // lambda (e) - ); // RAJA::loop - } // outer lambda (ctx) - ); // RAJA::launch + runRAJAImpl(res); } // loop over kernel reps stopTimer(); @@ -503,161 +519,17 @@ void MASSVEC3DPA::runCudaVariantImpl(VariantID vid) if constexpr (tune_idx == 1) { - using inner_x = RAJA::LoopPolicy>; + using inner_x = RAJA::LoopPolicy>; - using inner_y = RAJA::LoopPolicy>; + using inner_y = RAJA::LoopPolicy>; - using inner_z = RAJA::LoopPolicy>; + using inner_z = RAJA::LoopPolicy>; startTimer(); // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { - RAJA::launch( - res, - RAJA::LaunchParams(RAJA::Teams(NE), - RAJA::Threads(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, NE), - [&](Index_type e) { - - MASSVEC3DPA_0_GPU - - RAJA::loop(ctx, RAJA::RangeSegment(0, 1), - [&](Index_type) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type q) { - MASSVEC3DPA_1; - } // lambda (q) - ); // RAJA::loop - } // lambda (d) - ); // RAJA::loop - } // lambda () - ); // RAJA::loop - - for (Index_type c = 0; c < 3; ++c) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, 
MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_2; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_3; - } // lambda (qx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_4; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_5; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_6; - } // lambda (dx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - 
RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_7; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_8; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - } // c - dim loop - - } // lambda (e) - ); // RAJA::loop - } // outer lambda (ctx) - ); // RAJA::launch + runRAJAImpl(res); } // loop over kernel reps stopTimer(); @@ -675,149 +547,7 @@ void MASSVEC3DPA::runCudaVariantImpl(VariantID vid) // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { - RAJA::launch( - res, - RAJA::LaunchParams(RAJA::Teams(NE), - RAJA::Threads(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - RAJA::loop(ctx, RAJA::RangeSegment(0, NE), - [&](Index_type e) { - - MASSVEC3DPA_0_GPU - - RAJA::loop(ctx, RAJA::RangeSegment(0, 1), - [&](Index_type) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type q) { - MASSVEC3DPA_1; - } // lambda (q) - ); // RAJA::loop - } // lambda (d) - ); // RAJA::loop - } // lambda () - ); // RAJA::loop - - for (Index_type c = 0; c < 3; ++c) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_2; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - 
RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_3; - } // lambda (qx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_4; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_5; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_6; - } // lambda (dx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_7; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - 
[&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_8; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - } // c - dim loop - } // lambda (e) - ); // RAJA::loop - } // outer lambda (ctx) - ); // RAJA::launch + runRAJAImpl(res); } // loop over kernel reps stopTimer(); diff --git a/src/apps/MASSVEC3DPA-Hip.cpp b/src/apps/MASSVEC3DPA-Hip.cpp index 10e62c095..d6b2a43e9 100644 --- a/src/apps/MASSVEC3DPA-Hip.cpp +++ b/src/apps/MASSVEC3DPA-Hip.cpp @@ -21,7 +21,7 @@ namespace apps { template __launch_bounds__(block_size) __global__ -void MassVec3DPA_BLOCKDIM_LOOP_INC(const Real_ptr B, const Real_ptr Bt, +void MassVec3DPA_BLOCKDIM_LOOP_INC(const Real_ptr B, const Real_ptr D, const Real_ptr X, Real_ptr Y) { @@ -30,42 +30,42 @@ void MassVec3DPA_BLOCKDIM_LOOP_INC(const Real_ptr B, const Real_ptr Bt, MASSVEC3DPA_0_GPU; - GPU_SHARED_LOOP_2D(q, d, MVPA_Q1D, MVPA_D1D) { + GPU_SHARED_LOOP_2D(q, d, mvpa::Q1D, mvpa::D1D) { MASSVEC3DPA_1; } for (Index_type c = 0; c < 3; ++c) { - GPU_SHARED_LOOP_3D(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D) { + GPU_SHARED_LOOP_3D(dx, dy, dz, mvpa::D1D, mvpa::D1D, mvpa::D1D) { MASSVEC3DPA_2; } __syncthreads(); - GPU_SHARED_LOOP_3D(qx, dy, dz, MVPA_Q1D, MVPA_D1D, MVPA_D1D) { + GPU_SHARED_LOOP_3D(qx, dy, dz, mvpa::Q1D, mvpa::D1D, mvpa::D1D) { MASSVEC3DPA_3; } __syncthreads(); - GPU_SHARED_LOOP_3D(qx, qy, dz, MVPA_Q1D, MVPA_Q1D, MVPA_D1D) { + GPU_SHARED_LOOP_3D(qx, qy, dz, mvpa::Q1D, mvpa::Q1D, mvpa::D1D) { MASSVEC3DPA_4; } __syncthreads(); - GPU_SHARED_LOOP_3D(qx, qy, qz, MVPA_Q1D, MVPA_Q1D, MVPA_Q1D) { + GPU_SHARED_LOOP_3D(qx, qy, qz, mvpa::Q1D, mvpa::Q1D, mvpa::Q1D) { MASSVEC3DPA_5; } __syncthreads(); - GPU_SHARED_LOOP_3D(dx, qy, qz, MVPA_D1D, MVPA_Q1D, MVPA_Q1D) { + GPU_SHARED_LOOP_3D(dx, qy, qz, mvpa::D1D, mvpa::Q1D, mvpa::Q1D) { MASSVEC3DPA_6; } __syncthreads(); - GPU_SHARED_LOOP_3D(dx, dy, qz, MVPA_D1D, 
MVPA_D1D, MVPA_Q1D) { + GPU_SHARED_LOOP_3D(dx, dy, qz, mvpa::D1D, mvpa::D1D, mvpa::Q1D) { MASSVEC3DPA_7; } __syncthreads(); - GPU_SHARED_LOOP_3D(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D) { + GPU_SHARED_LOOP_3D(dx, dy, dz, mvpa::D1D, mvpa::D1D, mvpa::D1D) { MASSVEC3DPA_8; } __syncthreads(); @@ -75,58 +75,58 @@ void MassVec3DPA_BLOCKDIM_LOOP_INC(const Real_ptr B, const Real_ptr Bt, template __launch_bounds__(block_size) __global__ -void MassVec3DPA_ARGUMENT_LOOP_INC(const Real_ptr B, const Real_ptr Bt, - const Real_ptr D, const Real_ptr X, - Real_ptr Y, - const Index_type runtime_block_size) +void MassVec3DPA_ARGUMENT_LOOP_INC(const Real_ptr B, + const Real_ptr D, const Real_ptr X, + Real_ptr Y, + const Index_type runtime_block_size) { const Index_type e = blockIdx.x; MASSVEC3DPA_0_GPU; - GPU_SHARED_LOOP_2D_INC(q, d, MVPA_Q1D, MVPA_D1D, runtime_block_size) { + GPU_SHARED_LOOP_2D_INC(q, d, mvpa::Q1D, mvpa::D1D, runtime_block_size) { MASSVEC3DPA_1; } for (Index_type c = 0; c < 3; ++c) { - GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, + GPU_SHARED_LOOP_3D_INC(dx, dy, dz, mvpa::D1D, mvpa::D1D, mvpa::D1D, runtime_block_size) { MASSVEC3DPA_2; } __syncthreads(); - GPU_SHARED_LOOP_3D_INC(qx, dy, dz, MVPA_Q1D, MVPA_D1D, MVPA_D1D, + GPU_SHARED_LOOP_3D_INC(qx, dy, dz, mvpa::Q1D, mvpa::D1D, mvpa::D1D, runtime_block_size) { MASSVEC3DPA_3; } __syncthreads(); - GPU_SHARED_LOOP_3D_INC(qx, qy, dz, MVPA_Q1D, MVPA_Q1D, MVPA_D1D, + GPU_SHARED_LOOP_3D_INC(qx, qy, dz, mvpa::Q1D, mvpa::Q1D, mvpa::D1D, runtime_block_size) { MASSVEC3DPA_4; } __syncthreads(); - GPU_SHARED_LOOP_3D_INC(qx, qy, qz, MVPA_Q1D, MVPA_Q1D, MVPA_Q1D, + GPU_SHARED_LOOP_3D_INC(qx, qy, qz, mvpa::Q1D, mvpa::Q1D, mvpa::Q1D, runtime_block_size) { MASSVEC3DPA_5; } __syncthreads(); - GPU_SHARED_LOOP_3D_INC(dx, qy, qz, MVPA_D1D, MVPA_Q1D, MVPA_Q1D, + GPU_SHARED_LOOP_3D_INC(dx, qy, qz, mvpa::D1D, mvpa::Q1D, mvpa::Q1D, runtime_block_size) { MASSVEC3DPA_6; } __syncthreads(); - GPU_SHARED_LOOP_3D_INC(dx, dy, qz, 
MVPA_D1D, MVPA_D1D, MVPA_Q1D, + GPU_SHARED_LOOP_3D_INC(dx, dy, qz, mvpa::D1D, mvpa::D1D, mvpa::Q1D, runtime_block_size) { MASSVEC3DPA_7; } __syncthreads(); - GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, + GPU_SHARED_LOOP_3D_INC(dx, dy, dz, mvpa::D1D, mvpa::D1D, mvpa::D1D, runtime_block_size) { MASSVEC3DPA_8; } @@ -137,7 +137,7 @@ void MassVec3DPA_ARGUMENT_LOOP_INC(const Real_ptr B, const Real_ptr Bt, template __launch_bounds__(block_size) __global__ -void MassVec3DPA_COMPILE_LOOP_INC(const Real_ptr B, const Real_ptr Bt, +void MassVec3DPA_COMPILE_LOOP_INC(const Real_ptr B, const Real_ptr D, const Real_ptr X, Real_ptr Y) { @@ -146,48 +146,48 @@ void MassVec3DPA_COMPILE_LOOP_INC(const Real_ptr B, const Real_ptr Bt, MASSVEC3DPA_0_GPU; - GPU_SHARED_LOOP_2D_INC(q, d, MVPA_Q1D, MVPA_D1D, block_size) { + GPU_SHARED_LOOP_2D_INC(q, d, mvpa::Q1D, mvpa::D1D, block_size) { MASSVEC3DPA_1; } for (Index_type c = 0; c < 3; ++c) { - GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, + GPU_SHARED_LOOP_3D_INC(dx, dy, dz, mvpa::D1D, mvpa::D1D, mvpa::D1D, block_size) { MASSVEC3DPA_2; } __syncthreads(); - GPU_SHARED_LOOP_3D_INC(qx, dy, dz, MVPA_Q1D, MVPA_D1D, MVPA_D1D, + GPU_SHARED_LOOP_3D_INC(qx, dy, dz, mvpa::Q1D, mvpa::D1D, mvpa::D1D, block_size) { MASSVEC3DPA_3; } __syncthreads(); - GPU_SHARED_LOOP_3D_INC(qx, qy, dz, MVPA_Q1D, MVPA_Q1D, MVPA_D1D, + GPU_SHARED_LOOP_3D_INC(qx, qy, dz, mvpa::Q1D, mvpa::Q1D, mvpa::D1D, block_size) { MASSVEC3DPA_4; } __syncthreads(); - GPU_SHARED_LOOP_3D_INC(qx, qy, qz, MVPA_Q1D, MVPA_Q1D, MVPA_Q1D, + GPU_SHARED_LOOP_3D_INC(qx, qy, qz, mvpa::Q1D, mvpa::Q1D, mvpa::Q1D, block_size) { MASSVEC3DPA_5; } __syncthreads(); - GPU_SHARED_LOOP_3D_INC(dx, qy, qz, MVPA_D1D, MVPA_Q1D, MVPA_Q1D, + GPU_SHARED_LOOP_3D_INC(dx, qy, qz, mvpa::D1D, mvpa::Q1D, mvpa::Q1D, block_size) { MASSVEC3DPA_6; } __syncthreads(); - GPU_SHARED_LOOP_3D_INC(dx, dy, qz, MVPA_D1D, MVPA_D1D, MVPA_Q1D, + GPU_SHARED_LOOP_3D_INC(dx, dy, qz, mvpa::D1D, mvpa::D1D, 
mvpa::Q1D, block_size) { MASSVEC3DPA_7; } __syncthreads(); - GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, + GPU_SHARED_LOOP_3D_INC(dx, dy, dz, mvpa::D1D, mvpa::D1D, mvpa::D1D, block_size) { MASSVEC3DPA_8; } @@ -198,50 +198,51 @@ void MassVec3DPA_COMPILE_LOOP_INC(const Real_ptr B, const Real_ptr Bt, template __launch_bounds__(block_size) __global__ -void MassVec3DPA_DIRECT(const Real_ptr B, const Real_ptr Bt, - const Real_ptr D, const Real_ptr X, Real_ptr Y) +void MassVec3DPA_DIRECT(const Real_ptr B, + const Real_ptr D, const Real_ptr X, + Real_ptr Y) { const Index_type e = blockIdx.x; MASSVEC3DPA_0_GPU; - GPU_SHARED_DIRECT_2D(q, d, MVPA_Q1D, MVPA_D1D) { + GPU_SHARED_DIRECT_2D(q, d, mvpa::Q1D, mvpa::D1D) { MASSVEC3DPA_1; } for (Index_type c = 0; c < 3; ++c) { - GPU_SHARED_DIRECT_3D(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D) { + GPU_SHARED_DIRECT_3D(dx, dy, dz, mvpa::D1D, mvpa::D1D, mvpa::D1D) { MASSVEC3DPA_2; } __syncthreads(); - GPU_SHARED_DIRECT_3D(qx, dy, dz, MVPA_Q1D, MVPA_D1D, MVPA_D1D) { + GPU_SHARED_DIRECT_3D(qx, dy, dz, mvpa::Q1D, mvpa::D1D, mvpa::D1D) { MASSVEC3DPA_3; } __syncthreads(); - GPU_SHARED_DIRECT_3D(qx, qy, dz, MVPA_Q1D, MVPA_Q1D, MVPA_D1D) { + GPU_SHARED_DIRECT_3D(qx, qy, dz, mvpa::Q1D, mvpa::Q1D, mvpa::D1D) { MASSVEC3DPA_4; } __syncthreads(); - GPU_SHARED_DIRECT_3D(qx, qy, qz, MVPA_Q1D, MVPA_Q1D, MVPA_Q1D) { + GPU_SHARED_DIRECT_3D(qx, qy, qz, mvpa::Q1D, mvpa::Q1D, mvpa::Q1D) { MASSVEC3DPA_5; } __syncthreads(); - GPU_SHARED_DIRECT_3D(dx, qy, qz, MVPA_D1D, MVPA_Q1D, MVPA_Q1D) { + GPU_SHARED_DIRECT_3D(dx, qy, qz, mvpa::D1D, mvpa::Q1D, mvpa::Q1D) { MASSVEC3DPA_6; } __syncthreads(); - GPU_SHARED_DIRECT_3D(dx, dy, qz, MVPA_D1D, MVPA_D1D, MVPA_Q1D) { + GPU_SHARED_DIRECT_3D(dx, dy, qz, mvpa::D1D, mvpa::D1D, mvpa::Q1D) { MASSVEC3DPA_7; } __syncthreads(); - GPU_SHARED_DIRECT_3D(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D) { + GPU_SHARED_DIRECT_3D(dx, dy, dz, mvpa::D1D, mvpa::D1D, mvpa::D1D) { MASSVEC3DPA_8; } __syncthreads(); @@ -249,6 
+250,169 @@ void MassVec3DPA_DIRECT(const Real_ptr B, const Real_ptr Bt, } // (c) dimension loop } +template +void MASSVEC3DPA::runRAJAImpl(RESOURCE &res) +{ + + MASSVEC3DPA_DATA_SETUP; + + constexpr bool async = true; + + using launch_policy = RAJA::LaunchPolicy< + RAJA::hip_launch_t>; + + using outer_x = RAJA::LoopPolicy; + + //clang-format off + RAJA::launch( + res, + RAJA::LaunchParams(RAJA::Teams(NE), + RAJA::Threads(mvpa::Q1D, mvpa::Q1D, mvpa::Q1D)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, NE), + [&](Index_type e) { + + MASSVEC3DPA_0_GPU + + RAJA::loop(ctx, RAJA::RangeSegment(0, 1), + [&](Index_type) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), + [&](Index_type d) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), + [&](Index_type q) { + MASSVEC3DPA_1; + } // lambda (q) + ); // RAJA::loop + } // lambda (d) + ); // RAJA::loop + } // lambda () + ); // RAJA::loop + + for (Index_type c = 0; c < 3; ++c) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), + [&](Index_type dx) { + MASSVEC3DPA_2; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), + [&](Index_type qx) { + MASSVEC3DPA_3; + } // lambda (qx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), + [&](Index_type qx) { + MASSVEC3DPA_4; + } // lambda 
(qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), + [&](Index_type qx) { + MASSVEC3DPA_5; + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), + [&](Index_type dx) { + MASSVEC3DPA_6; + } // lambda (dx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), + [&](Index_type dx) { + MASSVEC3DPA_7; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), + [&](Index_type dx) { + MASSVEC3DPA_8; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + ctx.teamSync(); + + } // c - dim loop + } // lambda (e) + ); // RAJA::loop + } // outer lambda (ctx) + ); // RAJA::launch + //clang-format on + +} + template void MASSVEC3DPA::runHipVariantImpl(VariantID vid) { @@ -271,11 +435,11 @@ void MASSVEC3DPA::runHipVariantImpl(VariantID vid) // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; 
irep < run_reps; RP_REPCOUNTINC(irep)) { - dim3 nthreads_per_block(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D); + dim3 nthreads_per_block(mvpa::Q1D, mvpa::Q1D, mvpa::Q1D); constexpr size_t shmem = 0; RPlaunchHipKernel((MassVec3DPA_BLOCKDIM_LOOP_INC), NE, - nthreads_per_block, shmem, res.get_stream(), B, Bt, D, + nthreads_per_block, shmem, res.get_stream(), B, D, X, Y); } stopTimer(); @@ -287,12 +451,12 @@ void MASSVEC3DPA::runHipVariantImpl(VariantID vid) // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { - dim3 nthreads_per_block(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D); + dim3 nthreads_per_block(mvpa::Q1D, mvpa::Q1D, mvpa::Q1D); constexpr size_t shmem = 0; RPlaunchHipKernel((MassVec3DPA_ARGUMENT_LOOP_INC), NE, - nthreads_per_block, shmem, res.get_stream(), B, Bt, D, - X, Y, static_cast(MVPA_Q1D)); + nthreads_per_block, shmem, res.get_stream(), B, D, + X, Y, static_cast(mvpa::Q1D)); } stopTimer(); @@ -302,11 +466,11 @@ void MASSVEC3DPA::runHipVariantImpl(VariantID vid) // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { - dim3 nthreads_per_block(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D); + dim3 nthreads_per_block(mvpa::Q1D, mvpa::Q1D, mvpa::Q1D); constexpr size_t shmem = 0; RPlaunchHipKernel((MassVec3DPA_COMPILE_LOOP_INC), NE, - nthreads_per_block, shmem, res.get_stream(), B, Bt, D, + nthreads_per_block, shmem, res.get_stream(), B, D, X, Y); } stopTimer(); @@ -317,11 +481,11 @@ void MASSVEC3DPA::runHipVariantImpl(VariantID vid) // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { - dim3 nthreads_per_block(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D); + dim3 nthreads_per_block(mvpa::Q1D, mvpa::Q1D, mvpa::Q1D); constexpr size_t shmem = 0; RPlaunchHipKernel((MassVec3DPA_DIRECT), NE, - nthreads_per_block, shmem, res.get_stream(), B, Bt, D, + nthreads_per_block, 
shmem, res.get_stream(), B, D, X, Y); } stopTimer(); @@ -332,10 +496,6 @@ void MASSVEC3DPA::runHipVariantImpl(VariantID vid) case RAJA_HIP: { - constexpr bool async = true; - - using launch_policy = RAJA::LaunchPolicy< - RAJA::hip_launch_t>; using outer_x = RAJA::LoopPolicy; @@ -351,150 +511,7 @@ void MASSVEC3DPA::runHipVariantImpl(VariantID vid) // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { - RAJA::launch( - res, - RAJA::LaunchParams(RAJA::Teams(NE), - RAJA::Threads(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, NE), - [&](Index_type e) { - - MASSVEC3DPA_0_GPU - - RAJA::loop(ctx, RAJA::RangeSegment(0, 1), - [&](Index_type) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type q) { - MASSVEC3DPA_1; - } // lambda (q) - ); // RAJA::loop - } // lambda (d) - ); // RAJA::loop - } // lambda () - ); // RAJA::loop - - for (Index_type c = 0; c < 3; ++c) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_2; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_3; - } // lambda (qx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - 
[&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_4; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_5; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_6; - } // lambda (dx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_7; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_8; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - } // c - dim loop - } // lambda (e) - ); // RAJA::loop - } // outer lambda (ctx) - ); // RAJA::launch + runRAJAImpl(res); } // loop over kernel reps stopTimer(); @@ -502,161 +519,17 @@ void MASSVEC3DPA::runHipVariantImpl(VariantID vid) if constexpr 
(tune_idx == 1) { - using inner_x = RAJA::LoopPolicy>; + using inner_x = RAJA::LoopPolicy>; - using inner_y = RAJA::LoopPolicy>; + using inner_y = RAJA::LoopPolicy>; - using inner_z = RAJA::LoopPolicy>; + using inner_z = RAJA::LoopPolicy>; startTimer(); // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { - RAJA::launch( - res, - RAJA::LaunchParams(RAJA::Teams(NE), - RAJA::Threads(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, NE), - [&](Index_type e) { - - MASSVEC3DPA_0_GPU - - RAJA::loop(ctx, RAJA::RangeSegment(0, 1), - [&](Index_type) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type q) { - MASSVEC3DPA_1; - } // lambda (q) - ); // RAJA::loop - } // lambda (d) - ); // RAJA::loop - } // lambda () - ); // RAJA::loop - - for (Index_type c = 0; c < 3; ++c) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_2; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_3; - } // lambda (qx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - 
MASSVEC3DPA_4; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_5; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_6; - } // lambda (dx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_7; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_8; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - } // c - dim loop - - } // lambda (e) - ); // RAJA::loop - } // outer lambda (ctx) - ); // RAJA::launch + runRAJAImpl(res); } // loop over kernel reps stopTimer(); @@ -674,149 +547,7 @@ void MASSVEC3DPA::runHipVariantImpl(VariantID vid) // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < 
run_reps; RP_REPCOUNTINC(irep)) { - RAJA::launch( - res, - RAJA::LaunchParams(RAJA::Teams(NE), - RAJA::Threads(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - RAJA::loop(ctx, RAJA::RangeSegment(0, NE), - [&](Index_type e) { - - MASSVEC3DPA_0_GPU - - RAJA::loop(ctx, RAJA::RangeSegment(0, 1), - [&](Index_type) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type q) { - MASSVEC3DPA_1; - } // lambda (q) - ); // RAJA::loop - } // lambda (d) - ); // RAJA::loop - } // lambda () - ); // RAJA::loop - - for (Index_type c = 0; c < 3; ++c) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_2; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_3; - } // lambda (qx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_4; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - 
[&](Index_type qx) { - MASSVEC3DPA_5; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_6; - } // lambda (dx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_7; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_8; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - } // c - dim loop - } // lambda (e) - ); // RAJA::loop - } // outer lambda (ctx) - ); // RAJA::launch + runRAJAImpl(res); } // loop over kernel reps stopTimer(); diff --git a/src/apps/MASSVEC3DPA-OMP.cpp b/src/apps/MASSVEC3DPA-OMP.cpp index 7d59a4c5d..1137655bf 100644 --- a/src/apps/MASSVEC3DPA-OMP.cpp +++ b/src/apps/MASSVEC3DPA-OMP.cpp @@ -40,38 +40,38 @@ void MASSVEC3DPA::runOpenMPVariant(VariantID vid) MASSVEC3DPA_0_CPU; - SHARED_LOOP_2D(q, d, MVPA_Q1D, MVPA_D1D) { + SHARED_LOOP_2D(q, d, mvpa::Q1D, mvpa::D1D) { MASSVEC3DPA_1; } for (Index_type c = 0; c < 3; ++c) { - SHARED_LOOP_3D(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D) { + SHARED_LOOP_3D(dx, dy, dz, mvpa::D1D, mvpa::D1D, mvpa::D1D) { 
MASSVEC3DPA_2; } - SHARED_LOOP_3D(qx, dy, dz, MVPA_Q1D, MVPA_D1D, MVPA_D1D) { + SHARED_LOOP_3D(qx, dy, dz, mvpa::Q1D, mvpa::D1D, mvpa::D1D) { MASSVEC3DPA_3; } - SHARED_LOOP_3D(qx, qy, dz, MVPA_Q1D, MVPA_Q1D, MVPA_D1D) { + SHARED_LOOP_3D(qx, qy, dz, mvpa::Q1D, mvpa::Q1D, mvpa::D1D) { MASSVEC3DPA_4; } - SHARED_LOOP_3D(qx, qy, qz, MVPA_Q1D, MVPA_Q1D, MVPA_Q1D) { + SHARED_LOOP_3D(qx, qy, qz, mvpa::Q1D, mvpa::Q1D, mvpa::Q1D) { MASSVEC3DPA_5; } - SHARED_LOOP_3D(dx, qy, qz, MVPA_D1D, MVPA_Q1D, MVPA_Q1D) { + SHARED_LOOP_3D(dx, qy, qz, mvpa::D1D, mvpa::Q1D, mvpa::Q1D) { MASSVEC3DPA_6; } - SHARED_LOOP_3D(dx, dy, qz, MVPA_D1D, MVPA_D1D, MVPA_Q1D) { + SHARED_LOOP_3D(dx, dy, qz, mvpa::D1D, mvpa::D1D, mvpa::Q1D) { MASSVEC3DPA_7; } - SHARED_LOOP_3D(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D) { + SHARED_LOOP_3D(dx, dy, dz, mvpa::D1D, mvpa::D1D, mvpa::D1D) { MASSVEC3DPA_8; } @@ -103,6 +103,7 @@ void MASSVEC3DPA::runOpenMPVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { //Grid is empty as the host does not need a compute grid to be specified + //clang-format off RAJA::launch( res, RAJA::LaunchParams(), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { @@ -116,9 +117,9 @@ void MASSVEC3DPA::runOpenMPVariant(VariantID vid) //Masking out of the z-dimension thread is done with GPU versions RAJA::loop(ctx, RAJA::RangeSegment(0, 1), [&](Index_type) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), [&](Index_type q) { MASSVEC3DPA_1; } // lambda (q) @@ -130,11 +131,11 @@ void MASSVEC3DPA::runOpenMPVariant(VariantID vid) for (Index_type c = 0; c < 3; ++c) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, 
mvpa::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](Index_type dx) { MASSVEC3DPA_2; } // lambda (dx) @@ -146,11 +147,11 @@ void MASSVEC3DPA::runOpenMPVariant(VariantID vid) ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), [&](Index_type qx) { MASSVEC3DPA_3; } // lambda (qx) @@ -162,11 +163,11 @@ void MASSVEC3DPA::runOpenMPVariant(VariantID vid) ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), [&](Index_type qx) { MASSVEC3DPA_4; } // lambda (qx) @@ -178,11 +179,11 @@ void MASSVEC3DPA::runOpenMPVariant(VariantID vid) ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), [&](Index_type qx) { MASSVEC3DPA_5; } // lambda (qx) @@ -194,11 +195,11 @@ void MASSVEC3DPA::runOpenMPVariant(VariantID vid) ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), [&](Index_type qy) 
{ - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](Index_type dx) { MASSVEC3DPA_6; } // lambda (dx) @@ -210,11 +211,11 @@ void MASSVEC3DPA::runOpenMPVariant(VariantID vid) ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](Index_type dx) { MASSVEC3DPA_7; } // lambda (dx) @@ -226,11 +227,11 @@ void MASSVEC3DPA::runOpenMPVariant(VariantID vid) ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](Index_type dx) { MASSVEC3DPA_8; } // lambda (dx) @@ -247,6 +248,7 @@ void MASSVEC3DPA::runOpenMPVariant(VariantID vid) ); // RAJA::loop } // outer lambda (ctx) ); // // RAJA::launch + //clang-format on } // loop over kernel reps stopTimer(); diff --git a/src/apps/MASSVEC3DPA-Seq.cpp b/src/apps/MASSVEC3DPA-Seq.cpp index 73a03cc67..daebf2607 100644 --- a/src/apps/MASSVEC3DPA-Seq.cpp +++ b/src/apps/MASSVEC3DPA-Seq.cpp @@ -37,38 +37,38 @@ void MASSVEC3DPA::runSeqVariant(VariantID vid) MASSVEC3DPA_0_CPU; - SHARED_LOOP_2D(q, d, MVPA_Q1D, MVPA_D1D) { + SHARED_LOOP_2D(q, d, mvpa::Q1D, mvpa::D1D) { MASSVEC3DPA_1; } for (Index_type c = 0; c < 3; ++c) { - SHARED_LOOP_3D(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D) { + SHARED_LOOP_3D(dx, dy, dz, mvpa::D1D, mvpa::D1D, mvpa::D1D) { MASSVEC3DPA_2; } - SHARED_LOOP_3D(qx, dy, dz, MVPA_Q1D, MVPA_D1D, MVPA_D1D) { + SHARED_LOOP_3D(qx, dy, dz, mvpa::Q1D, mvpa::D1D, 
mvpa::D1D) { MASSVEC3DPA_3; } - SHARED_LOOP_3D(qx, qy, dz, MVPA_Q1D, MVPA_Q1D, MVPA_D1D) { + SHARED_LOOP_3D(qx, qy, dz, mvpa::Q1D, mvpa::Q1D, mvpa::D1D) { MASSVEC3DPA_4; } - SHARED_LOOP_3D(qx, qy, qz, MVPA_Q1D, MVPA_Q1D, MVPA_Q1D) { + SHARED_LOOP_3D(qx, qy, qz, mvpa::Q1D, mvpa::Q1D, mvpa::Q1D) { MASSVEC3DPA_5; } - SHARED_LOOP_3D(dx, qy, qz, MVPA_D1D, MVPA_Q1D, MVPA_Q1D) { + SHARED_LOOP_3D(dx, qy, qz, mvpa::D1D, mvpa::Q1D, mvpa::Q1D) { MASSVEC3DPA_6; } - SHARED_LOOP_3D(dx, dy, qz, MVPA_D1D, MVPA_D1D, MVPA_Q1D) { + SHARED_LOOP_3D(dx, dy, qz, mvpa::D1D, mvpa::D1D, mvpa::Q1D) { MASSVEC3DPA_7; } - SHARED_LOOP_3D(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D) { + SHARED_LOOP_3D(dx, dy, dz, mvpa::D1D, mvpa::D1D, mvpa::D1D) { MASSVEC3DPA_8; } @@ -100,7 +100,7 @@ void MASSVEC3DPA::runSeqVariant(VariantID vid) // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { - // clang-format off + //clang-format off RAJA::launch(res, RAJA::LaunchParams(), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { @@ -114,9 +114,9 @@ void MASSVEC3DPA::runSeqVariant(VariantID vid) //Masking out of the z-dimension thread is done with GPU versions RAJA::loop(ctx, RAJA::RangeSegment(0, 1), [&](Index_type) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), [&](Index_type q) { MASSVEC3DPA_1; } // lambda (q) @@ -128,11 +128,11 @@ void MASSVEC3DPA::runSeqVariant(VariantID vid) for (Index_type c = 0; c < 3; ++c) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, 
mvpa::D1D), [&](Index_type dx) { MASSVEC3DPA_2; } // lambda (dx) @@ -144,11 +144,11 @@ void MASSVEC3DPA::runSeqVariant(VariantID vid) ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), [&](Index_type qx) { MASSVEC3DPA_3; } // lambda (qx) @@ -160,11 +160,11 @@ void MASSVEC3DPA::runSeqVariant(VariantID vid) ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), [&](Index_type qx) { MASSVEC3DPA_4; } // lambda (qx) @@ -176,11 +176,11 @@ void MASSVEC3DPA::runSeqVariant(VariantID vid) ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), [&](Index_type qx) { MASSVEC3DPA_5; } // lambda (qx) @@ -192,11 +192,11 @@ void MASSVEC3DPA::runSeqVariant(VariantID vid) ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](Index_type dx) { 
MASSVEC3DPA_6; } // lambda (dx) @@ -208,11 +208,11 @@ void MASSVEC3DPA::runSeqVariant(VariantID vid) ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](Index_type dx) { MASSVEC3DPA_7; } // lambda (dx) @@ -224,11 +224,11 @@ void MASSVEC3DPA::runSeqVariant(VariantID vid) ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](Index_type dx) { MASSVEC3DPA_8; } // lambda (dx) @@ -246,7 +246,7 @@ void MASSVEC3DPA::runSeqVariant(VariantID vid) } // outer lambda (ctx) ); // RAJA::launch - // clang-format on + //clang-format on } // loop over kernel reps stopTimer(); diff --git a/src/apps/MASSVEC3DPA-Sycl.cpp b/src/apps/MASSVEC3DPA-Sycl.cpp index 7c9e0ec76..f360467fd 100644 --- a/src/apps/MASSVEC3DPA-Sycl.cpp +++ b/src/apps/MASSVEC3DPA-Sycl.cpp @@ -35,8 +35,8 @@ void MASSVEC3DPA::runSyclVariantImpl(VariantID vid) MASSVEC3DPA_DATA_SETUP; - const ::sycl::range<3> workGroupSize(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D); - const ::sycl::range<3> gridSize(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D*NE); + const ::sycl::range<3> workGroupSize(mvpa::Q1D, mvpa::Q1D, mvpa::Q1D); + const ::sycl::range<3> gridSize(mvpa::Q1D, mvpa::Q1D, mvpa::Q1D*NE); switch (vid) { @@ -48,8 +48,8 @@ void MASSVEC3DPA::runSyclVariantImpl(VariantID vid) qu->submit([&](::sycl::handler& h) { - constexpr Index_type MQ1 = MVPA_Q1D; - constexpr Index_type MD1 = MVPA_D1D; + constexpr Index_type MQ1 = mvpa::Q1D; + 
constexpr Index_type MD1 = mvpa::D1D; constexpr Index_type MDQ = (MQ1 > MD1) ? MQ1 : MD1; auto smB_vec = ::sycl::local_accessor(::sycl::range<1>(MQ1 * MD1), h); @@ -78,44 +78,44 @@ void MASSVEC3DPA::runSyclVariantImpl(VariantID vid) Real_type(*QQD)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])sm0; Real_type(*QDD)[MD1][MD1] = (Real_type(*)[MD1][MD1])sm1; - SYCL_SHARED_LOOP_2D(q, d, MVPA_Q1D, MVPA_D1D) { + SYCL_SHARED_LOOP_2D(q, d, mvpa::Q1D, mvpa::D1D) { MASSVEC3DPA_1; } for (int c = 0; c < 3; ++c) { - SYCL_SHARED_LOOP_3D(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D) { + SYCL_SHARED_LOOP_3D(dx, dy, dz, mvpa::D1D, mvpa::D1D, mvpa::D1D) { MASSVEC3DPA_2; } itm.barrier(::sycl::access::fence_space::local_space); - SYCL_SHARED_LOOP_3D(qx, dy, dz, MVPA_Q1D, MVPA_D1D, MVPA_D1D) { + SYCL_SHARED_LOOP_3D(qx, dy, dz, mvpa::Q1D, mvpa::D1D, mvpa::D1D) { MASSVEC3DPA_3; } itm.barrier(::sycl::access::fence_space::local_space); - SYCL_SHARED_LOOP_3D(qx, qy, dz, MVPA_Q1D, MVPA_Q1D, MVPA_D1D) { + SYCL_SHARED_LOOP_3D(qx, qy, dz, mvpa::Q1D, mvpa::Q1D, mvpa::D1D) { MASSVEC3DPA_4; } itm.barrier(::sycl::access::fence_space::local_space); - SYCL_SHARED_LOOP_3D(qx, qy, qz, MVPA_Q1D, MVPA_Q1D, MVPA_Q1D) { + SYCL_SHARED_LOOP_3D(qx, qy, qz, mvpa::Q1D, mvpa::Q1D, mvpa::Q1D) { MASSVEC3DPA_5; } itm.barrier(::sycl::access::fence_space::local_space); - SYCL_SHARED_LOOP_3D(dx, qy, qz, MVPA_D1D, MVPA_Q1D, MVPA_Q1D) { + SYCL_SHARED_LOOP_3D(dx, qy, qz, mvpa::D1D, mvpa::Q1D, mvpa::Q1D) { MASSVEC3DPA_6; } itm.barrier(::sycl::access::fence_space::local_space); - SYCL_SHARED_LOOP_3D(dx, dy, qz, MVPA_D1D, MVPA_D1D, MVPA_Q1D) { + SYCL_SHARED_LOOP_3D(dx, dy, qz, mvpa::D1D, mvpa::D1D, mvpa::Q1D) { MASSVEC3DPA_7; } itm.barrier(::sycl::access::fence_space::local_space); - SYCL_SHARED_LOOP_3D(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D) { + SYCL_SHARED_LOOP_3D(dx, dy, dz, mvpa::D1D, mvpa::D1D, mvpa::D1D) { MASSVEC3DPA_8; } itm.barrier(::sycl::access::fence_space::local_space); @@ -148,29 +148,30 @@ void 
MASSVEC3DPA::runSyclVariantImpl(VariantID vid) //Caclulate amount of shared memory needed size_t shmem = 0; { - constexpr int MQ1 = MVPA_Q1D; - constexpr int MD1 = MVPA_D1D; + constexpr int MQ1 = mvpa::Q1D; + constexpr int MD1 = mvpa::D1D; constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1; constexpr int no_mats = 2; - shmem += MQ1 * MD1 * no_mats * MDQ * MDQ * MDQ * sizeof(double); + shmem += MQ1 * MD1 * no_mats * MDQ * MDQ * MDQ * sizeof(Real_type); } startTimer(); // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { + //clang-format off RAJA::launch( res, RAJA::LaunchParams(RAJA::Teams(NE), - RAJA::Threads(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D), shmem), + RAJA::Threads(mvpa::Q1D, mvpa::Q1D, mvpa::Q1D), shmem), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { RAJA::loop(ctx, RAJA::RangeSegment(0, NE), [&](int e) { //Redefine inside the lambda to keep consistent with base version - constexpr Index_type MQ1 = MVPA_Q1D; - constexpr Index_type MD1 = MVPA_D1D; + constexpr Index_type MQ1 = mvpa::Q1D; + constexpr Index_type MD1 = mvpa::D1D; constexpr Index_type MDQ = (MQ1 > MD1) ? 
MQ1 : MD1; Real_ptr smB_arr = ctx.getSharedMemory(MQ1 * MD1); @@ -192,8 +193,8 @@ void MASSVEC3DPA::runSyclVariantImpl(VariantID vid) //3 loops to remain consistent with the GPU versions //Masking out of the z-dimension thread is done with GPU versions RAJA::loop(ctx, RAJA::RangeSegment(0, 1), [&](int ) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), [&](int d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), [&](int q) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](int d) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), [&](int q) { MASSVEC3DPA_1; }); }); @@ -201,9 +202,9 @@ void MASSVEC3DPA::runSyclVariantImpl(VariantID vid) for (int c = 0; c < 3; ++c) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), [&](int dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), [&](int dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), [&](int dx) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](int dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](int dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](int dx) { MASSVEC3DPA_2; }); }); @@ -211,27 +212,27 @@ void MASSVEC3DPA::runSyclVariantImpl(VariantID vid) ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), [&](int dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), [&](int dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), [&](int qx) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](int dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](int dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), [&](int qx) { MASSVEC3DPA_3; }); }); }); ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), [&](int dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), [&](int qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), [&](int qx) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](int dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), [&](int qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, 
mvpa::Q1D), [&](int qx) { MASSVEC3DPA_4; }); }); }); ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), [&](int qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), [&](int qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), [&](int qx) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), [&](int qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), [&](int qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), [&](int qx) { MASSVEC3DPA_5; }); @@ -239,9 +240,9 @@ void MASSVEC3DPA::runSyclVariantImpl(VariantID vid) }); ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), [&](int qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), [&](int qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), [&](int dx) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), [&](int qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), [&](int qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](int dx) { MASSVEC3DPA_6; }); @@ -250,9 +251,9 @@ void MASSVEC3DPA::runSyclVariantImpl(VariantID vid) ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), [&](int qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), [&](int dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), [&](int dx) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::Q1D), [&](int qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](int dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](int dx) { MASSVEC3DPA_7; }); @@ -260,9 +261,9 @@ void MASSVEC3DPA::runSyclVariantImpl(VariantID vid) }); ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), [&](int dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), [&](int dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), [&](int dx) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](int dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](int dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mvpa::D1D), [&](int dx) { MASSVEC3DPA_8; }); }); @@ 
-277,6 +278,7 @@ void MASSVEC3DPA::runSyclVariantImpl(VariantID vid) } // outer lambda (ctx) ); // RAJA::launch + //clang-format on } // loop over kernel reps stopTimer(); diff --git a/src/apps/MASSVEC3DPA.cpp b/src/apps/MASSVEC3DPA.cpp index 5e5716cd1..a750dd2d2 100644 --- a/src/apps/MASSVEC3DPA.cpp +++ b/src/apps/MASSVEC3DPA.cpp @@ -21,40 +21,40 @@ MASSVEC3DPA::MASSVEC3DPA(const RunParams ¶ms) : KernelBase(rajaperf::Apps_MASSVEC3DPA, params) { - const Index_type NE_initial = 15625; + const Index_type NE_initial = 5208; - setDefaultProblemSize(NE_initial * MVPA_Q1D * MVPA_Q1D * MVPA_Q1D); + setDefaultProblemSize(NE_initial * mvpa::DIM * mvpa::D1D * mvpa::D1D * mvpa::D1D); setDefaultReps(50); m_NE = - std::max((getTargetProblemSize() + (MVPA_Q1D * MVPA_Q1D * MVPA_Q1D) / 2) / - (MVPA_Q1D * MVPA_Q1D * MVPA_Q1D), + std::max((getTargetProblemSize() + (mvpa::DIM * mvpa::Q1D * mvpa::Q1D * mvpa::Q1D) / 2) / + (mvpa::DIM * mvpa::Q1D * mvpa::Q1D * mvpa::Q1D), Index_type(1)); - setActualProblemSize(m_NE * MVPA_Q1D * MVPA_Q1D * MVPA_Q1D); + setActualProblemSize(m_NE * mvpa::DIM * mvpa::Q1D * mvpa::Q1D * mvpa::Q1D); setItsPerRep(getActualProblemSize()); setKernelsPerRep(1); - setBytesReadPerRep(2 * sizeof(Real_type) * MVPA_Q1D * MVPA_D1D + // B, Bt - 3 * sizeof(Real_type) * MVPA_D1D * MVPA_D1D * MVPA_D1D * - MVPA_DIM * m_NE + // X (3 components) - 1 * sizeof(Real_type) * MVPA_Q1D * MVPA_Q1D * MVPA_Q1D * + setBytesReadPerRep(2 * sizeof(Real_type) * mvpa::Q1D * mvpa::D1D + // B, Bt + 3 * sizeof(Real_type) * mvpa::D1D * mvpa::D1D * mvpa::D1D * + mvpa::DIM * m_NE + // X (3 components) + 1 * sizeof(Real_type) * mvpa::Q1D * mvpa::Q1D * mvpa::Q1D * m_NE); // D - setBytesWrittenPerRep(3 * sizeof(Real_type) * MVPA_D1D * MVPA_D1D * MVPA_D1D * - MVPA_DIM * m_NE); // Y (3 components) + setBytesWrittenPerRep(3 * sizeof(Real_type) * mvpa::D1D * mvpa::D1D * mvpa::D1D * + mvpa::DIM * m_NE); // Y (3 components) setBytesModifyWrittenPerRep( 0 ); setBytesAtomicModifyWrittenPerRep(0); //3 for the 
dimension loop - setFLOPsPerRep(m_NE * MVPA_DIM * - (2 * MVPA_D1D * MVPA_Q1D * MVPA_D1D * MVPA_D1D + - 2 * MVPA_D1D * MVPA_Q1D * MVPA_Q1D * MVPA_D1D + - 2 * MVPA_D1D * MVPA_Q1D * MVPA_Q1D * MVPA_Q1D + - MVPA_Q1D * MVPA_Q1D * MVPA_Q1D + - 2 * MVPA_Q1D * MVPA_D1D * MVPA_Q1D * MVPA_Q1D + - 2 * MVPA_Q1D * MVPA_D1D * MVPA_D1D * MVPA_Q1D + - 2 * MVPA_Q1D * MVPA_D1D * MVPA_D1D * MVPA_D1D)); + setFLOPsPerRep(m_NE * mvpa::DIM * + (2 * mvpa::D1D * mvpa::Q1D * mvpa::D1D * mvpa::D1D + + 2 * mvpa::D1D * mvpa::Q1D * mvpa::Q1D * mvpa::D1D + + 2 * mvpa::D1D * mvpa::Q1D * mvpa::Q1D * mvpa::Q1D + + mvpa::Q1D * mvpa::Q1D * mvpa::Q1D + + 2 * mvpa::Q1D * mvpa::D1D * mvpa::Q1D * mvpa::Q1D + + 2 * mvpa::Q1D * mvpa::D1D * mvpa::D1D * mvpa::Q1D + + 2 * mvpa::Q1D * mvpa::D1D * mvpa::D1D * mvpa::D1D)); setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); setChecksumTolerance(ChecksumTolerance::normal); @@ -71,26 +71,24 @@ MASSVEC3DPA::~MASSVEC3DPA() {} void MASSVEC3DPA::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { - allocAndInitDataConst(m_B, MVPA_Q1D * MVPA_D1D, 1.0, vid); - allocAndInitDataConst(m_Bt, MVPA_Q1D * MVPA_D1D, 1.0, vid); - allocAndInitDataConst(m_D, MVPA_Q1D * MVPA_Q1D * MVPA_Q1D * m_NE, 1.0, vid); + allocAndInitDataConst(m_B, mvpa::Q1D * mvpa::D1D, 1.0, vid); + allocAndInitDataConst(m_D, mvpa::Q1D * mvpa::Q1D * mvpa::Q1D * m_NE, 1.0, vid); - allocAndInitDataConst(m_X, MVPA_D1D * MVPA_D1D * MVPA_D1D * MVPA_DIM * m_NE, 1.0, vid); + allocAndInitDataConst(m_X, mvpa::D1D * mvpa::D1D * mvpa::D1D * mvpa::DIM * m_NE, 1.0, vid); - allocAndInitDataConst(m_Y, MVPA_D1D * MVPA_D1D * MVPA_D1D * MVPA_DIM * m_NE, 0.0, vid); + allocAndInitDataConst(m_Y, mvpa::D1D * mvpa::D1D * mvpa::D1D * mvpa::DIM * m_NE, 0.0, vid); } void MASSVEC3DPA::updateChecksum(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { - addToChecksum(m_Y, MVPA_DIM * MVPA_D1D * MVPA_D1D * MVPA_D1D * m_NE, vid); + addToChecksum(m_Y, mvpa::DIM * mvpa::D1D * mvpa::D1D * mvpa::D1D * m_NE, 
vid); } void MASSVEC3DPA::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { deallocData(m_B, vid); - deallocData(m_Bt, vid); deallocData(m_D, vid); deallocData(m_X, vid); deallocData(m_Y, vid); diff --git a/src/apps/MASSVEC3DPA.hpp b/src/apps/MASSVEC3DPA.hpp index 71ea09f67..73a68283b 100644 --- a/src/apps/MASSVEC3DPA.hpp +++ b/src/apps/MASSVEC3DPA.hpp @@ -9,45 +9,44 @@ /// /// Action of 3D mass matrix via partial assembly on a block vector (3 blocks) /// -/// for (int e = 0; e < NE; ++e) { +/// for (Index_type e = 0; e < NE; ++e) { /// -/// double B[MQ1][MD1]; -/// double Bt[MD1][MQ1]; +/// Real_type B[MQ1][MD1]; /// -/// double sm0[MDQ * MDQ * MDQ]; -/// double sm1[MDQ * MDQ * MDQ]; -/// double(*X)[MD1][MD1] = (double(*)[MD1][MD1])sm0; -/// double(*DDQ)[MD1][MQ1] = (double(*)[MD1][MQ1])sm1; -/// double(*DQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm0; -/// double(*QQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm1; -/// double(*QQD)[MQ1][MD1] = (double(*)[MQ1][MD1])sm0; -/// double(*QDD)[MD1][MD1] = (double(*)[MD1][MD1])sm1; +/// Real_type sm0[MDQ * MDQ * MDQ]; +/// Real_type sm1[MDQ * MDQ * MDQ]; +/// Real_type(*X)[MD1][MD1] = (Real_type(*)[MD1][MD1])sm0; +/// Real_type(*DDQ)[MD1][MQ1] = (Real_type(*)[MD1][MQ1])sm1; +/// Real_type(*DQQ)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])sm0; +/// Real_type(*QQQ)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])sm1; +/// Real_type(*QQD)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])sm0; +/// Real_type(*QDD)[MD1][MD1] = (Real_type(*)[MD1][MD1])sm1; /// -/// for (int d = 0; d < MVPA_D1D; ++d) { -/// for (int q = 0; q < MVPA_Q1D; ++q) { -/// double basis = b(q, d); +/// for (Index_type d = 0; d < mvpa::D1D; ++d) { +/// for (Index_type q = 0; q < mvpa::Q1D; ++q) { +/// Real_type basis = b(q, d); /// B[q][d] = basis; /// Bt[d][q] = basis; /// } /// } /// -/// for (int c = 0; c < 3; ++c) { +/// for (Index_type c = 0; c < 3; ++c) { /// -/// for (int dz = 0; dz < MVPA_D1D; ++dz) { -/// for (int dy = 0; dy < MVPA_D1D; ++dy) { -/// for (int dx = 0; dx < 
MVPA_D1D; ++dx) { +/// for (Index_type dz = 0; dz < mvpa::D1D; ++dz) { +/// for (Index_type dy = 0; dy < mvpa::D1D; ++dy) { +/// for (Index_type dx = 0; dx < mvpa::D1D; ++dx) { /// /// smX[dz][dy][dx] = mvpaX_(dx, dy, dz, c, e); /// } /// } /// } /// -/// for (int dz = 0; dz < MVPA_D1D; ++dz) { -/// for (int dy = 0; dy < MVPA_D1D; ++dy) { -/// for (int qx = 0; qx < MVPA_Q1D; ++qx) { +/// for (Index_type dz = 0; dz < mvpa::D1D; ++dz) { +/// for (Index_type dy = 0; dy < mvpa::D1D; ++dy) { +/// for (Index_type qx = 0; qx < mvpa::Q1D; ++qx) { /// -/// double u = 0.0; -/// for (int dx = 0; dx < MVPA_D1D; ++dx) { +/// Real_type u = 0.0; +/// for (Index_type dx = 0; dx < mvpa::D1D; ++dx) { /// u += X[dz][dy][dx] * B[qx][dx]; /// } /// DDQ[dz][dy][qx] = u; @@ -55,12 +54,12 @@ /// } /// } /// -/// for (int dz = 0; dz < MVPA_D1D; ++dz) { -/// for (int qy = 0; qy < MVPA_Q1D; ++qy) { -/// for (int qx = 0; qx < MVPA_Q1D; ++qx) { +/// for (Index_type dz = 0; dz < mvpa::D1D; ++dz) { +/// for (Index_type qy = 0; qy < mvpa::Q1D; ++qy) { +/// for (Index_type qx = 0; qx < mvpa::Q1D; ++qx) { /// -/// double u = 0.0; -/// for (int dy = 0; dy < MVPA_D1D; ++dy) { +/// Real_type u = 0.0; +/// for (Index_type dy = 0; dy < mvpa::D1D; ++dy) { /// u += DDQ[dz][dy][qx] * B[qy][dy]; /// } /// DQQ[dz][qy][qx] = u; @@ -68,12 +67,12 @@ /// } /// } /// -/// for (int qz = 0; qz < MVPA_Q1D; ++qz) { -/// for (int qy = 0; qy < MVPA_Q1D; ++qy) { -/// for (int qx = 0; qx < MVPA_Q1D; ++qx) { +/// for (Index_type qz = 0; qz < mvpa::Q1D; ++qz) { +/// for (Index_type qy = 0; qy < mvpa::Q1D; ++qy) { +/// for (Index_type qx = 0; qx < mvpa::Q1D; ++qx) { /// -/// double u = 0.0; -/// for (int dz = 0; dz < MVPA_D1D; ++dz) { +/// Real_type u = 0.0; +/// for (Index_type dz = 0; dz < mvpa::D1D; ++dz) { /// u += DQQ[dz][qy][qx] * B[qz][dz]; /// } /// QQQ[qz][qy][qx] = u * D(qx, qy, qz, e); @@ -81,12 +80,12 @@ /// } /// } /// -/// for (int qz = 0; qz < MVPA_Q1D; ++qz) { -/// for (int qy = 0; qy < MVPA_Q1D; ++qy) { 
-/// for (int dx = 0; dx < MVPA_D1D; ++dx) { +/// for (Index_type qz = 0; qz < mvpa::Q1D; ++qz) { +/// for (Index_type qy = 0; qy < mvpa::Q1D; ++qy) { +/// for (Index_type dx = 0; dx < mvpa::D1D; ++dx) { /// -/// double u = 0.0; -/// for (int qx = 0; qx < MVPA_Q1D; ++qx) { +/// Real_type u = 0.0; +/// for (Index_type qx = 0; qx < mvpa::Q1D; ++qx) { /// u += QQQ[qz][qy][qx] * Bt[dx][qx]; /// } /// QQD[qz][qy][dx] = u; @@ -94,12 +93,12 @@ /// } /// } /// -/// for (int qz = 0; qz < MVPA_Q1D; ++qz) { -/// for (int dy = 0; dy < MVPA_D1D; ++dy) { -/// for (int dx = 0; dx < MVPA_D1D; ++dx) { +/// for (Index_type qz = 0; qz < mvpa::Q1D; ++qz) { +/// for (Index_type dy = 0; dy < mvpa::D1D; ++dy) { +/// for (Index_type dx = 0; dx < mvpa::D1D; ++dx) { /// -/// double u = 0.0; -/// for (int qy = 0; qy < MVPA_Q1D; ++qy) { +/// Real_type u = 0.0; +/// for (Index_type qy = 0; qy < mvpa::Q1D; ++qy) { /// u += QQD[qz][qy][dx] * Bt[dy][qy]; /// } /// QDD[qz][dy][dx] = u; @@ -107,12 +106,12 @@ /// } /// } /// -/// for (int dz = 0; dz < MVPA_D1D; ++dz) { -/// for (int dy = 0; dy < MVPA_D1D; ++dy) { -/// for (int dx = 0; dx < MVPA_D1D; ++dx) { +/// for (Index_type dz = 0; dz < mvpa::D1D; ++dz) { +/// for (Index_type dy = 0; dy < mvpa::D1D; ++dy) { +/// for (Index_type dx = 0; dx < mvpa::D1D; ++dx) { /// -/// double u = 0.0; -/// for (int qz = 0; qz < MVPA_Q1D; ++qz) { +/// Real_type u = 0.0; +/// for (Index_type qz = 0; qz < mvpa::Q1D; ++qz) { /// u += QDD[qz][dy][dx] * Bt[dz][qz]; /// } /// mvpaY_(dx, dy, dz, c, e) = u; @@ -128,7 +127,6 @@ #define MASSVEC3DPA_DATA_SETUP \ Real_ptr B = m_B; \ - Real_ptr Bt = m_Bt; \ Real_ptr D = m_D; \ Real_ptr X = m_X; \ Real_ptr Y = m_Y; \ @@ -140,27 +138,28 @@ #include "RAJA/RAJA.hpp" // Number of Dofs/Qpts in 1D -#define MVPA_D1D 3 -#define MVPA_Q1D 4 -#define MVPA_DIM 3 -#define MVPA_B(x, y) B[x + MVPA_Q1D * y] -#define MVPA_Bt(x, y) Bt[x + MVPA_D1D * y] +namespace mvpa { +constexpr RAJA::Index_type D1D = 3; +constexpr RAJA::Index_type Q1D = 4; 
+constexpr RAJA::Index_type DIM = 3; +} // namespace mvpa +#define MVPA_B(x, y) B[x + mvpa::Q1D * y] #define MVPA_X(dx, dy, dz, c, e) \ - X[dx + MVPA_D1D * dy + MVPA_D1D * MVPA_D1D * dz + \ - MVPA_D1D * MVPA_D1D * MVPA_D1D * c + \ - MVPA_D1D * MVPA_D1D * MVPA_D1D * MVPA_DIM * e] + X[dx + mvpa::D1D * dy + mvpa::D1D * mvpa::D1D * dz + \ + mvpa::D1D * mvpa::D1D * mvpa::D1D * c + \ + mvpa::D1D * mvpa::D1D * mvpa::D1D * mvpa::DIM * e] #define MVPA_Y(dx, dy, dz, c, e) \ - Y[dx + MVPA_D1D * dy + MVPA_D1D * MVPA_D1D * dz + \ - MVPA_D1D * MVPA_D1D * MVPA_D1D * c + \ - MVPA_D1D * MVPA_D1D * MVPA_D1D * MVPA_DIM * e] + Y[dx + mvpa::D1D * dy + mvpa::D1D * mvpa::D1D * dz + \ + mvpa::D1D * mvpa::D1D * mvpa::D1D * c + \ + mvpa::D1D * mvpa::D1D * mvpa::D1D * mvpa::DIM * e] #define MVPA_D(qx, qy, qz, e) \ - D[qx + MVPA_Q1D * qy + MVPA_Q1D * MVPA_Q1D * qz + \ - MVPA_Q1D * MVPA_Q1D * MVPA_Q1D * e] + D[qx + mvpa::Q1D * qy + mvpa::Q1D * mvpa::Q1D * qz + \ + mvpa::Q1D * mvpa::Q1D * mvpa::Q1D * e] #define MASSVEC3DPA_0_CPU \ - constexpr int MQ1 = MVPA_Q1D; \ - constexpr int MD1 = MVPA_D1D; \ - constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1; \ + constexpr Index_type MQ1 = mvpa::Q1D; \ + constexpr Index_type MD1 = mvpa::D1D; \ + constexpr Index_type MDQ = (MQ1 > MD1) ? MQ1 : MD1; \ /*RAJA_TEAM_SHARED*/ Real_type smB[MQ1][MD1]; \ /*RAJA_TEAM_SHARED*/ Real_type smBt[MD1][MQ1]; \ /*RAJA_TEAM_SHARED*/ Real_type sm0[MDQ * MDQ * MDQ]; \ @@ -173,9 +172,9 @@ Real_type(*QDD)[MD1][MD1] = (Real_type(*)[MD1][MD1])sm1; #define MASSVEC3DPA_0_GPU \ - constexpr int MQ1 = MVPA_Q1D; \ - constexpr int MD1 = MVPA_D1D; \ - constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1; \ + constexpr Index_type MQ1 = mvpa::Q1D; \ + constexpr Index_type MD1 = mvpa::D1D; \ + constexpr Index_type MDQ = (MQ1 > MD1) ? 
MQ1 : MD1; \ RAJA_TEAM_SHARED Real_type smB[MQ1][MD1]; \ RAJA_TEAM_SHARED Real_type smBt[MD1][MQ1]; \ RAJA_TEAM_SHARED Real_type sm0[MDQ * MDQ * MDQ]; \ @@ -194,51 +193,51 @@ #define MASSVEC3DPA_2 smX[dz][dy][dx] = MVPA_X(dx, dy, dz, c, e); -// 2 * MVPA_D1D * MVPA_Q1D * MVPA_D1D * MVPA_D1D +// 2 * mvpa::D1D * mvpa::Q1D * mvpa::D1D * mvpa::D1D #define MASSVEC3DPA_3 \ Real_type u = 0.0; \ - for (Index_type dx = 0; dx < MVPA_D1D; ++dx) { \ + for (Index_type dx = 0; dx < mvpa::D1D; ++dx) { \ u += smX[dz][dy][dx] * smB[qx][dx]; \ } \ DDQ[dz][dy][qx] = u; -// 2 * MVPA_D1D * MVPA_Q1D * MVPA_Q1D * MVPA_D1D +// 2 * mvpa::D1D * mvpa::Q1D * mvpa::Q1D * mvpa::D1D #define MASSVEC3DPA_4 \ Real_type u = 0.0; \ - for (Index_type dy = 0; dy < MVPA_D1D; ++dy) { \ + for (Index_type dy = 0; dy < mvpa::D1D; ++dy) { \ u += DDQ[dz][dy][qx] * smB[qy][dy]; \ } \ DQQ[dz][qy][qx] = u; -// 2 * MVPA_D1D * MVPA_Q1D * MVPA_Q1D * MVPA_Q1D + MVPA_Q1D * MVPA_Q1D * -// MVPA_Q1D +// 2 * mvpa::D1D * mvpa::Q1D * mvpa::Q1D * mvpa::Q1D + mvpa::Q1D * mvpa::Q1D * +// mvpa::Q1D #define MASSVEC3DPA_5 \ Real_type u = 0.0; \ - for (Index_type dz = 0; dz < MVPA_D1D; ++dz) { \ + for (Index_type dz = 0; dz < mvpa::D1D; ++dz) { \ u += DQQ[dz][qy][qx] * smB[qz][dz]; \ } \ QQQ[qz][qy][qx] = u * MVPA_D(qx, qy, qz, e); -// 2 * MVPA_Q1D * MVPA_D1D * MVPA_Q1D * MVPA_Q1D +// 2 * mvpa::Q1D * mvpa::D1D * mvpa::Q1D * mvpa::Q1D #define MASSVEC3DPA_6 \ Real_type u = 0.0; \ - for (Index_type qx = 0; qx < MVPA_Q1D; ++qx) { \ + for (Index_type qx = 0; qx < mvpa::Q1D; ++qx) { \ u += QQQ[qz][qy][qx] * smBt[dx][qx]; \ } \ QQD[qz][qy][dx] = u; -// 2 * MVPA_Q1D * MVPA_D1D * MVPA_D1D * MVPA_Q1D +// 2 * mvpa::Q1D * mvpa::D1D * mvpa::D1D * mvpa::Q1D #define MASSVEC3DPA_7 \ Real_type u = 0.0; \ - for (Index_type qy = 0; qy < MVPA_Q1D; ++qy) { \ + for (Index_type qy = 0; qy < mvpa::Q1D; ++qy) { \ u += QQD[qz][qy][dx] * smBt[dy][qy]; \ } \ QDD[qz][dy][dx] = u; -// 2 * MVPA_Q1D * MVPA_D1D * MVPA_D1D * MVPA_D1D +// 2 * mvpa::Q1D * mvpa::D1D 
* mvpa::D1D * mvpa::D1D #define MASSVEC3DPA_8 \ Real_type u = 0.0; \ - for (Index_type qz = 0; qz < MVPA_Q1D; ++qz) { \ + for (Index_type qz = 0; qz < mvpa::Q1D; ++qz) { \ u += QDD[qz][dy][dx] * smBt[dz][qz]; \ } \ MVPA_Y(dx, dy, dz, c, e) = u; @@ -271,15 +270,18 @@ class MASSVEC3DPA : public KernelBase { void runCudaVariantImpl(VariantID vid); template void runHipVariantImpl(VariantID vid); - template - void runSyclVariantImpl(VariantID vid); + template void runSyclVariantImpl(VariantID vid); + + template + void runRAJAImpl(RESOURCE &res); private: - static const size_t default_gpu_block_size = MVPA_Q1D * MVPA_Q1D * MVPA_Q1D; + static const size_t default_gpu_block_size = + mvpa::Q1D * mvpa::Q1D * mvpa::Q1D; using gpu_block_sizes_type = integer::list_type; Real_ptr m_B; - Real_ptr m_Bt; Real_ptr m_D; Real_ptr m_X; Real_ptr m_Y; diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 379863e9d..a526bc555 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -95,6 +95,7 @@ #include "apps/LTIMES_NOVIEW.hpp" #include "apps/MASS3DEA.hpp" #include "apps/MASS3DPA.hpp" +#include "apps/MASS3DPA_ATOMIC.hpp" #include "apps/MASSVEC3DPA.hpp" #include "apps/MATVEC_3D_STENCIL.hpp" #include "apps/NODAL_ACCUMULATION_3D.hpp" @@ -254,6 +255,7 @@ static const std::string KernelNames [] = std::string("Apps_LTIMES_NOVIEW"), std::string("Apps_MASS3DEA"), std::string("Apps_MASS3DPA"), + std::string("Apps_MASS3DPA_ATOMIC"), std::string("Apps_MASSVEC3DPA"), std::string("Apps_MATVEC_3D_STENCIL"), std::string("Apps_NODAL_ACCUMULATION_3D"), @@ -1152,6 +1154,10 @@ KernelBase* getKernelObject(KernelID kid, kernel = new apps::MASS3DPA(run_params); break; } + case Apps_MASS3DPA_ATOMIC : { + kernel = new apps::MASS3DPA_ATOMIC(run_params); + break; + } case Apps_MASSVEC3DPA : { kernel = new apps::MASSVEC3DPA(run_params); break; diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 3c7b5d275..331ab7fbd 100644 --- 
a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -155,6 +155,7 @@ enum KernelID { Apps_LTIMES_NOVIEW, Apps_MASS3DEA, Apps_MASS3DPA, + Apps_MASS3DPA_ATOMIC, Apps_MASSVEC3DPA, Apps_MATVEC_3D_STENCIL, Apps_NODAL_ACCUMULATION_3D, diff --git a/src/common/RPTypes.hpp b/src/common/RPTypes.hpp index df059e276..cbcb7e8d8 100644 --- a/src/common/RPTypes.hpp +++ b/src/common/RPTypes.hpp @@ -194,6 +194,13 @@ using Complex_ptr = Complex_type*; #define RAJAPERF_ATOMIC_MAX_HIP(lhs, rhs) \ ::atomicMax(&(lhs), (rhs)) +#define RAJAPERF_ATOMIC_ADD_SYCL(lhs, rhs) \ + sycl::atomic_ref, \ + sycl::memory_order::relaxed, \ + sycl::memory_scope::device, \ + sycl::access::address_space::global_space \ + > atomic_y(lhs); \ + atomic_y.fetch_add(rhs); #define RAJAPERF_ATOMIC_ADD_RAJA_SEQ(lhs, rhs) \ RAJA::atomicAdd(&(lhs), (rhs)) @@ -215,6 +222,9 @@ using Complex_ptr = Complex_type*; #define RAJAPERF_ATOMIC_MAX_RAJA_HIP(lhs, rhs) \ RAJA::atomicMax(&(lhs), (rhs)) +#define RAJAPERF_ATOMIC_ADD_RAJA_SYCL(lhs, rhs) \ + RAJA::atomicAdd(&(lhs), (rhs)) + } // closing brace for rajaperf namespace #endif // closing endif for header file include guard