Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
b8ed6ee
initial commit for all fem updates
artv3 Dec 26, 2025
e366158
add mfem-v4.9 draft kernel
artv3 Dec 26, 2025
3cb54d8
update diffusion ref impl
artv3 Dec 26, 2025
63562b1
update base impl for diffusion
artv3 Dec 26, 2025
e659836
update seq variant
artv3 Dec 26, 2025
1c046eb
update pass
artv3 Dec 26, 2025
b3c283a
update diffusion kernel to mfem v4.9
artv3 Dec 26, 2025
d027da1
use real_type and index_type
artv3 Dec 26, 2025
27b2090
MASS3DPA.hpp
artv3 Dec 26, 2025
bece84d
validate mfem assemble MASS3DEA kernel
artv3 Dec 26, 2025
ef56fa1
first cut at MASS3DPA with atomics
artv3 Jan 5, 2026
a415054
clean up sequential variant, add raja impl
artv3 Jan 5, 2026
cb911cf
push up variants of the atomic mass kernel
artv3 Jan 5, 2026
45f0d4d
update MASS ATOMIC
artv3 Jan 6, 2026
2ae809b
Update src/apps/MASS3DPA_ATOMIC.cpp
artv3 Jan 6, 2026
a6776ef
Merge branch 'develop' into artv3/fem-updates
artv3 Jan 6, 2026
f63a428
Merge branch 'develop' into artv3/fem-updates
artv3 Jan 6, 2026
3765620
fix target problem setup
artv3 Jan 6, 2026
ad8ea04
add sycl mass atomic kernel and fix diffusion
artv3 Jan 6, 2026
75f192e
clean up pass
artv3 Jan 6, 2026
ed0cd0e
clean up pass for convection
artv3 Jan 6, 2026
f38052f
update diffusion FOM
artv3 Jan 6, 2026
f8cf871
clean up pass
artv3 Jan 6, 2026
f8281d9
update mass ea
artv3 Jan 6, 2026
9544930
update MASS EA, VEC, and remove unused var
artv3 Jan 6, 2026
b23de21
remove unused var
artv3 Jan 6, 2026
d41fad5
ready for review
artv3 Jan 6, 2026
aa112c3
Merge branch 'develop' into artv3/fem-updates
artv3 Jan 7, 2026
90a63d8
merge with develop
artv3 Jan 7, 2026
2f8ce64
consitency with type alias
artv3 Jan 7, 2026
d2ca5d8
clean up pass
artv3 Jan 8, 2026
a7d7183
clang format off on launch kernels
artv3 Jan 8, 2026
db8bb4a
minor formatting
artv3 Jan 8, 2026
78d34fc
remove unused helper
artv3 Jan 8, 2026
5464a26
clean up pass
artv3 Jan 8, 2026
65da788
Merge branch 'develop' into artv3/fem-updates
artv3 Jan 8, 2026
4b6ef23
add missing macros
artv3 Jan 8, 2026
92ba026
Merge branch 'artv3/fem-updates' of github.com:llnl/RAJAPerf into art…
artv3 Jan 8, 2026
45907df
clean up pass
artv3 Jan 8, 2026
ddec26f
use macro for sycl atomic
artv3 Jan 8, 2026
b96bd85
Update src/common/RPTypes.hpp
artv3 Jan 8, 2026
6d85660
restart CI
artv3 Jan 9, 2026
6c039e7
Merge branch 'develop' into artv3/fem-updates
artv3 Jan 9, 2026
16dbbfd
Merge branch 'develop' into artv3/fem-updates
rhornung67 Jan 9, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ blt_add_executable(
apps/MASS3DEA-Seq.cpp
apps/MASS3DPA.cpp
apps/MASS3DPA-Seq.cpp
apps/MASS3DPA_ATOMIC.cpp
apps/MASS3DPA_ATOMIC-Seq.cpp
apps/MASSVEC3DPA.cpp
apps/MASSVEC3DPA-Seq.cpp
apps/MATVEC_3D_STENCIL.cpp
Expand Down
6 changes: 6 additions & 0 deletions src/apps/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,12 @@ blt_add_library(
MASS3DPA-Seq.cpp
MASS3DPA-OMP.cpp
MASS3DPA-Sycl.cpp
MASS3DPA_ATOMIC.cpp
MASS3DPA_ATOMIC-Cuda.cpp
MASS3DPA_ATOMIC-Hip.cpp
MASS3DPA_ATOMIC-Seq.cpp
MASS3DPA_ATOMIC-OMP.cpp
MASS3DPA_ATOMIC-Sycl.cpp
MASSVEC3DPA.cpp
MASSVEC3DPA-Cuda.cpp
MASSVEC3DPA-Hip.cpp
Expand Down
112 changes: 57 additions & 55 deletions src/apps/CONVECTION3DPA-Cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ namespace rajaperf {
namespace apps {

template < size_t block_size >
__launch_bounds__(block_size)
__launch_bounds__(block_size)
__global__ void Convection3DPA(const Real_ptr Basis, const Real_ptr tBasis,
const Real_ptr dBasis, const Real_ptr D,
const Real_ptr X, Real_ptr Y) {
Expand All @@ -29,95 +29,95 @@ __global__ void Convection3DPA(const Real_ptr Basis, const Real_ptr tBasis,

CONVECTION3DPA_0_GPU;

GPU_FOREACH_THREAD(dz,z,CPA_D1D)
GPU_FOREACH_THREAD(dz,z,conv::D1D)
{
GPU_FOREACH_THREAD(dy,y,CPA_D1D)
GPU_FOREACH_THREAD(dy,y,conv::D1D)
{
GPU_FOREACH_THREAD(dx,x,CPA_D1D)
GPU_FOREACH_THREAD(dx,x,conv::D1D)
{
CONVECTION3DPA_1;
}
}
}
__syncthreads();

GPU_FOREACH_THREAD(dz,z,CPA_D1D)
GPU_FOREACH_THREAD(dz,z,conv::D1D)
{
GPU_FOREACH_THREAD(dy,y,CPA_D1D)
GPU_FOREACH_THREAD(dy,y,conv::D1D)
{
GPU_FOREACH_THREAD(qx,x,CPA_Q1D)
GPU_FOREACH_THREAD(qx,x,conv::Q1D)
{
CONVECTION3DPA_2;
}
}
}
__syncthreads();

GPU_FOREACH_THREAD(dz,z,CPA_D1D)
GPU_FOREACH_THREAD(dz,z,conv::D1D)
{
GPU_FOREACH_THREAD(qx,x,CPA_Q1D)
GPU_FOREACH_THREAD(qx,x,conv::Q1D)
{
GPU_FOREACH_THREAD(qy,y,CPA_Q1D)
GPU_FOREACH_THREAD(qy,y,conv::Q1D)
{
CONVECTION3DPA_3;
}
}
}
__syncthreads();

GPU_FOREACH_THREAD(qx,x,CPA_Q1D)
GPU_FOREACH_THREAD(qx,x,conv::Q1D)
{
GPU_FOREACH_THREAD(qy,y,CPA_Q1D)
GPU_FOREACH_THREAD(qy,y,conv::Q1D)
{
GPU_FOREACH_THREAD(qz,z,CPA_Q1D)
GPU_FOREACH_THREAD(qz,z,conv::Q1D)
{
CONVECTION3DPA_4;
}
}
}
__syncthreads();

GPU_FOREACH_THREAD(qz,z,CPA_Q1D)
GPU_FOREACH_THREAD(qz,z,conv::Q1D)
{
GPU_FOREACH_THREAD(qy,y,CPA_Q1D)
GPU_FOREACH_THREAD(qy,y,conv::Q1D)
{
GPU_FOREACH_THREAD(qx,x,CPA_Q1D)
GPU_FOREACH_THREAD(qx,x,conv::Q1D)
{
CONVECTION3DPA_5;
}
}
}
__syncthreads();

GPU_FOREACH_THREAD(qx,x,CPA_Q1D)
GPU_FOREACH_THREAD(qx,x,conv::Q1D)
{
GPU_FOREACH_THREAD(qy,y,CPA_Q1D)
GPU_FOREACH_THREAD(qy,y,conv::Q1D)
{
GPU_FOREACH_THREAD(dz,z,CPA_D1D)
GPU_FOREACH_THREAD(dz,z,conv::D1D)
{
CONVECTION3DPA_6;
}
}
}
__syncthreads();

GPU_FOREACH_THREAD(dz,z,CPA_D1D)
GPU_FOREACH_THREAD(dz,z,conv::D1D)
{
GPU_FOREACH_THREAD(qx,x,CPA_Q1D)
GPU_FOREACH_THREAD(qx,x,conv::Q1D)
{
GPU_FOREACH_THREAD(dy,y,CPA_D1D)
GPU_FOREACH_THREAD(dy,y,conv::D1D)
{
CONVECTION3DPA_7;
}
}
}
__syncthreads();

GPU_FOREACH_THREAD(dz,z,CPA_D1D)
GPU_FOREACH_THREAD(dz,z,conv::D1D)
{
GPU_FOREACH_THREAD(dy,y,CPA_D1D)
GPU_FOREACH_THREAD(dy,y,conv::D1D)
{
GPU_FOREACH_THREAD(dx,x,CPA_D1D)
GPU_FOREACH_THREAD(dx,x,conv::D1D)
{
CONVECTION3DPA_8;
}
Expand All @@ -144,7 +144,7 @@ void CONVECTION3DPA::runCudaVariantImpl(VariantID vid) {
// Loop counter increment uses macro to quiet C++20 compiler warning
for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) {

dim3 nthreads_per_block(CPA_Q1D, CPA_Q1D, CPA_Q1D);
dim3 nthreads_per_block(conv::Q1D, conv::Q1D, conv::Q1D);
constexpr size_t shmem = 0;

RPlaunchCudaKernel( (Convection3DPA<block_size>),
Expand All @@ -162,39 +162,40 @@ void CONVECTION3DPA::runCudaVariantImpl(VariantID vid) {
constexpr bool async = true;

using launch_policy =
RAJA::LaunchPolicy<RAJA::cuda_launch_t<async, CPA_Q1D*CPA_Q1D*CPA_Q1D>>;
RAJA::LaunchPolicy<RAJA::cuda_launch_t<async, conv::Q1D*conv::Q1D*conv::Q1D>>;

using outer_x =
RAJA::LoopPolicy<RAJA::cuda_block_x_direct>;

using inner_x =
RAJA::LoopPolicy<RAJA::cuda_thread_size_x_loop<CPA_Q1D>>;
RAJA::LoopPolicy<RAJA::cuda_thread_size_x_loop<conv::Q1D>>;

using inner_y =
RAJA::LoopPolicy<RAJA::cuda_thread_size_y_loop<CPA_Q1D>>;
RAJA::LoopPolicy<RAJA::cuda_thread_size_y_loop<conv::Q1D>>;

using inner_z =
RAJA::LoopPolicy<RAJA::cuda_thread_size_z_loop<CPA_Q1D>>;
RAJA::LoopPolicy<RAJA::cuda_thread_size_z_loop<conv::Q1D>>;

startTimer();
// Loop counter increment uses macro to quiet C++20 compiler warning
for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) {

//clang-format off
RAJA::launch<launch_policy>( res,
RAJA::LaunchParams(RAJA::Teams(NE),
RAJA::Threads(CPA_Q1D, CPA_Q1D, CPA_Q1D)),
RAJA::Threads(conv::Q1D, conv::Q1D, conv::Q1D)),
[=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {

RAJA::loop<outer_x>(ctx, RAJA::RangeSegment(0, NE),
[&](Index_type e) {

CONVECTION3DPA_0_GPU;

RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, CPA_D1D),
RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, conv::D1D),
[&](Index_type dz) {
RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, CPA_D1D),
RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, conv::D1D),
[&](Index_type dy) {
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, CPA_D1D),
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, conv::D1D),
[&](Index_type dx) {

CONVECTION3DPA_1;
Expand All @@ -208,11 +209,11 @@ void CONVECTION3DPA::runCudaVariantImpl(VariantID vid) {

ctx.teamSync();

RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, CPA_D1D),
RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, conv::D1D),
[&](Index_type dz) {
RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, CPA_D1D),
RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, conv::D1D),
[&](Index_type dy) {
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, conv::Q1D),
[&](Index_type qx) {

CONVECTION3DPA_2;
Expand All @@ -226,11 +227,11 @@ void CONVECTION3DPA::runCudaVariantImpl(VariantID vid) {

ctx.teamSync();

RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, CPA_D1D),
RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, conv::D1D),
[&](Index_type dz) {
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, conv::Q1D),
[&](Index_type qx) {
RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, conv::Q1D),
[&](Index_type qy) {

CONVECTION3DPA_3;
Expand All @@ -244,11 +245,11 @@ void CONVECTION3DPA::runCudaVariantImpl(VariantID vid) {

ctx.teamSync();

RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, conv::Q1D),
[&](Index_type qx) {
RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, conv::Q1D),
[&](Index_type qy) {
RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, conv::Q1D),
[&](Index_type qz) {

CONVECTION3DPA_4;
Expand All @@ -262,11 +263,11 @@ void CONVECTION3DPA::runCudaVariantImpl(VariantID vid) {

ctx.teamSync();

RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, conv::Q1D),
[&](Index_type qz) {
RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, conv::Q1D),
[&](Index_type qy) {
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, conv::Q1D),
[&](Index_type qx) {

CONVECTION3DPA_5;
Expand All @@ -280,11 +281,11 @@ void CONVECTION3DPA::runCudaVariantImpl(VariantID vid) {

ctx.teamSync();

RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, conv::Q1D),
[&](Index_type qx) {
RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, conv::Q1D),
[&](Index_type qy) {
RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, CPA_D1D),
RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, conv::D1D),
[&](Index_type dz) {

CONVECTION3DPA_6;
Expand All @@ -298,11 +299,11 @@ void CONVECTION3DPA::runCudaVariantImpl(VariantID vid) {

ctx.teamSync();

RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, CPA_D1D),
RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, conv::D1D),
[&](Index_type dz) {
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, CPA_Q1D),
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, conv::Q1D),
[&](Index_type qx) {
RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, CPA_D1D),
RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, conv::D1D),
[&](Index_type dy) {

CONVECTION3DPA_7;
Expand All @@ -316,11 +317,11 @@ void CONVECTION3DPA::runCudaVariantImpl(VariantID vid) {

ctx.teamSync();

RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, CPA_D1D),
RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, conv::D1D),
[&](Index_type dz) {
RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, CPA_D1D),
RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, conv::D1D),
[&](Index_type dy) {
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, CPA_D1D),
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, conv::D1D),
[&](Index_type dx) {

CONVECTION3DPA_8;
Expand All @@ -337,6 +338,7 @@ void CONVECTION3DPA::runCudaVariantImpl(VariantID vid) {

} // outer lambda (ctx)
); // RAJA::launch
//clang-format on

} // loop over kernel reps
stopTimer();
Expand Down
Loading