diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9438c31cdd..a85f434940 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -328,6 +328,13 @@ install(FILES
 # Setup internal RAJA configuration options
 include(cmake/SetupRajaConfig.cmake)
 
+if (RAJA_ENABLE_JIT)
+  target_include_directories(RAJA
+    PUBLIC
+    "${PROTEUS_INSTALL_DIR}/include"
+  )
+endif()
+
 if(RAJA_ENABLE_TESTS)
   add_subdirectory(test)
 endif()
diff --git a/cmake/SetupRajaOptions.cmake b/cmake/SetupRajaOptions.cmake
index 555856e801..d0c3c87b67 100644
--- a/cmake/SetupRajaOptions.cmake
+++ b/cmake/SetupRajaOptions.cmake
@@ -35,6 +35,7 @@
 option(RAJA_ENABLE_RUNTIME_PLUGINS "Enable support for loading plugins at runtime" Off)
 option(RAJA_ALLOW_INCONSISTENT_OPTIONS "Enable inconsistent values for ENABLE_X and RAJA_ENABLE_X options" Off)
 option(RAJA_ENABLE_DESUL_ATOMICS "Enable support of desul atomics" Off)
+option(RAJA_ENABLE_JIT "Enable JIT compilation for RAJA kernels" Off)
 
 set(DESUL_ENABLE_TESTS Off CACHE BOOL "")
 set(TEST_DRIVER "" CACHE STRING "driver used to wrap test commands")
diff --git a/include/RAJA/RAJA.hpp b/include/RAJA/RAJA.hpp
index e09b88151c..e8805b53a6 100644
--- a/include/RAJA/RAJA.hpp
+++ b/include/RAJA/RAJA.hpp
@@ -75,6 +75,10 @@
 #include "RAJA/policy/hip.hpp"
 #endif
 
+#if defined(RAJA_ENABLE_JIT)
+#include "proteus/JitInterface.hpp"
+#endif
+
 #if defined(RAJA_ENABLE_SYCL)
 #include "RAJA/policy/sycl.hpp"
 #endif
diff --git a/include/RAJA/config.hpp.in b/include/RAJA/config.hpp.in
index 36e4836378..d718c2d3e2 100644
--- a/include/RAJA/config.hpp.in
+++ b/include/RAJA/config.hpp.in
@@ -182,6 +182,7 @@ static_assert(RAJA_HAS_SOME_CXX14,
 #cmakedefine RAJA_ENABLE_CLANG_CUDA
 #cmakedefine RAJA_ENABLE_HIP
 #cmakedefine RAJA_ENABLE_SYCL
+#cmakedefine RAJA_ENABLE_JIT
 #cmakedefine RAJA_ENABLE_OMP_TASK
 
 #cmakedefine RAJA_ALLOW_OPENMP_5_1_ATOMICS
diff --git a/include/RAJA/policy/cuda/forall.hpp b/include/RAJA/policy/cuda/forall.hpp
index 2af88ca941..02b824c073 100644
--- a/include/RAJA/policy/cuda/forall.hpp
+++ b/include/RAJA/policy/cuda/forall.hpp
@@ -34,6 +34,7 @@
 
 #include "RAJA/util/macros.hpp"
 #include "RAJA/util/types.hpp"
+#include "RAJA/util/Jit.hpp"
 
 #include "RAJA/internal/fault_tolerance.hpp"
 
@@ -381,10 +382,11 @@ template <typename EXEC_POL,
                                (IterationGetter::block_size > 0),
                            size_t> BlockSize = IterationGetter::block_size>
 __launch_bounds__(BlockSize, BlocksPerSM) __global__
-    void forallp_cuda_kernel(const RAJA_CUDA_GRID_CONSTANT LOOP_BODY loop_body,
-                             const RAJA_CUDA_GRID_CONSTANT Iterator idx,
-                             const RAJA_CUDA_GRID_CONSTANT IndexType length,
-                             ForallParam f_params)
+    RAJA_JIT_COMPILE_ARGS(3) void forallp_cuda_kernel(
+        const RAJA_CUDA_GRID_CONSTANT LOOP_BODY loop_body,
+        const RAJA_CUDA_GRID_CONSTANT Iterator idx,
+        const RAJA_CUDA_GRID_CONSTANT IndexType length,
+        ForallParam f_params)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
@@ -412,7 +414,7 @@ template <typename EXEC_POL,
                                            IterationMapping>::value &&
                                (IterationGetter::block_size <= 0),
                            size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
-__global__ void forallp_cuda_kernel(
+__global__ RAJA_JIT_COMPILE_ARGS(3) void forallp_cuda_kernel(
     const RAJA_CUDA_GRID_CONSTANT LOOP_BODY loop_body,
     const RAJA_CUDA_GRID_CONSTANT Iterator idx,
     const RAJA_CUDA_GRID_CONSTANT IndexType length,
@@ -448,10 +450,11 @@ template<
                               (IterationGetter::block_size > 0),
                           size_t> BlockSize = IterationGetter::block_size>
 __launch_bounds__(BlockSize, BlocksPerSM) __global__
-    void forallp_cuda_kernel(const RAJA_CUDA_GRID_CONSTANT LOOP_BODY loop_body,
-                             const RAJA_CUDA_GRID_CONSTANT Iterator idx,
-                             const RAJA_CUDA_GRID_CONSTANT IndexType length,
-                             ForallParam f_params)
+    RAJA_JIT_COMPILE_ARGS(3) void forallp_cuda_kernel(
+        const RAJA_CUDA_GRID_CONSTANT LOOP_BODY loop_body,
+        const RAJA_CUDA_GRID_CONSTANT Iterator idx,
+        const RAJA_CUDA_GRID_CONSTANT IndexType length,
+        ForallParam f_params)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
@@ -482,7 +485,7 @@ template<
                                            IterationMapping>::value &&
                                (IterationGetter::block_size <= 0),
                            size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
-__global__ void forallp_cuda_kernel(
+__global__ RAJA_JIT_COMPILE_ARGS(3) void forallp_cuda_kernel(
     const RAJA_CUDA_GRID_CONSTANT LOOP_BODY loop_body,
     const RAJA_CUDA_GRID_CONSTANT Iterator idx,
     const RAJA_CUDA_GRID_CONSTANT IndexType length,
@@ -554,7 +557,7 @@ forall_impl(resources::Cuda cuda_res,
   // Only launch kernel if we have something to iterate over
   if (len > 0)
   {
-
+    RAJA::register_lambda(loop_body);
     auto func = reinterpret_cast<const void*>(
         &impl::forallp_cuda_kernel<EXEC_POL, Iterator, LOOP_BODY, IndexType,
                                    camp::decay<ForallParam>>);
@@ -642,6 +645,7 @@ RAJA_INLINE resources::EventProxy<resources::Cuda> forall_impl(
     LoopBody&& loop_body)
 {
   int num_seg = iset.getNumSegments();
+  RAJA::register_lambda(loop_body);
   for (int isi = 0; isi < num_seg; ++isi)
   {
     iset.segmentCall(
diff --git a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp
index 40c46554db..e70dabd08e 100644
--- a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp
+++ b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp
@@ -30,6 +30,7 @@
 
 #include "RAJA/util/macros.hpp"
 #include "RAJA/util/types.hpp"
+#include "RAJA/util/Jit.hpp"
 
 #include "RAJA/pattern/kernel.hpp"
 #include "RAJA/pattern/kernel/For.hpp"
@@ -211,7 +212,8 @@ namespace internal
  * CUDA global function for launching CudaKernel policies
  */
 template <typename Data, typename Exec>
-__global__ void CudaKernelLauncher(const RAJA_CUDA_GRID_CONSTANT Data data)
+__global__ RAJA_JIT_COMPILE void CudaKernelLauncher(
+    const RAJA_CUDA_GRID_CONSTANT Data data)
 {
   using data_t = camp::decay<Data>;
 
@@ -231,7 +233,7 @@ __global__ void CudaKernelLauncher(const RAJA_CUDA_GRID_CONSTANT Data data)
  * This launcher is used by the CudaKernelFixed policies.
 */
 template <typename Data, size_t BlockSize, size_t BlocksPerSM, typename Exec>
-__launch_bounds__(BlockSize, BlocksPerSM) __global__
+__launch_bounds__(BlockSize, BlocksPerSM) __global__ RAJA_JIT_COMPILE
     void CudaKernelLauncherFixed(const RAJA_CUDA_GRID_CONSTANT Data data)
 
 {
@@ -681,7 +683,7 @@ struct StatementExecutor<
       auto cuda_data = RAJA::cuda::make_launch_body(
           func, launch_dims.dims.blocks, launch_dims.dims.threads, shmem, res,
           data);
-
+      RAJA::register_lambda(func);
       //
       // Launch the kernel
       //
diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp
index d9cca09216..c26bb0682b 100644
--- a/include/RAJA/policy/cuda/launch.hpp
+++ b/include/RAJA/policy/cuda/launch.hpp
@@ -24,14 +24,15 @@
 
 #include "RAJA/policy/cuda/MemUtils_CUDA.hpp"
 #include "RAJA/policy/cuda/raja_cudaerrchk.hpp"
 #include "RAJA/util/resource.hpp"
+#include "RAJA/util/Jit.hpp"
 
 namespace RAJA
 {
 
 template <typename BODY, typename ReduceParams>
-__global__ void launch_new_reduce_global_fcn(const RAJA_CUDA_GRID_CONSTANT BODY
-                                             body_in,
-                                             ReduceParams reduce_params)
+__global__ RAJA_JIT_COMPILE void launch_new_reduce_global_fcn(
+    const RAJA_CUDA_GRID_CONSTANT BODY body_in,
+    ReduceParams reduce_params)
 {
   LaunchContext ctx;
@@ -95,7 +96,7 @@ struct LaunchExecute<
     if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero &&
         blockSize.x > zero && blockSize.y > zero && blockSize.z > zero)
     {
-
+      RAJA::register_lambda(body_in);
       RAJA_FT_BEGIN;
 
       size_t shared_mem_size = launch_params.shared_mem_size;
@@ -138,7 +139,7 @@ template <typename BODY,
                     int num_threads,
                     size_t BLOCKS_PER_SM,
                     typename ReduceParams>
-__launch_bounds__(num_threads, BLOCKS_PER_SM) __global__
+__launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ RAJA_JIT_COMPILE
     void launch_new_reduce_global_fcn_fixed(const RAJA_CUDA_GRID_CONSTANT BODY
                                                 body_in,
                                             ReduceParams reduce_params)
@@ -206,7 +207,7 @@ struct LaunchExecute<
     if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero &&
         blockSize.x > zero && blockSize.y > zero && blockSize.z > zero)
     {
-
+      RAJA::register_lambda(body_in);
       RAJA_FT_BEGIN;
 
       size_t shared_mem_size = launch_params.shared_mem_size;
diff --git a/include/RAJA/policy/hip/forall.hpp b/include/RAJA/policy/hip/forall.hpp
index 6f098a9a11..ad899a5380 100644
--- a/include/RAJA/policy/hip/forall.hpp
+++ b/include/RAJA/policy/hip/forall.hpp
@@ -35,6 +35,7 @@
 
 #include "RAJA/util/macros.hpp"
 #include "RAJA/util/types.hpp"
+#include "RAJA/util/Jit.hpp"
 
 #include "RAJA/internal/fault_tolerance.hpp"
 
@@ -374,10 +375,10 @@ template <typename EXEC_POL,
                               (IterationGetter::block_size > 0),
                           size_t> BlockSize = IterationGetter::block_size>
 __launch_bounds__(BlockSize, 1) __global__
-    void forallp_hip_kernel(const LOOP_BODY loop_body,
-                            const Iterator idx,
-                            const IndexType length,
-                            ForallParam f_params)
+    RAJA_JIT_COMPILE_ARGS(3) void forallp_hip_kernel(const LOOP_BODY loop_body,
+                                                     const Iterator idx,
+                                                     const IndexType length,
+                                                     ForallParam f_params)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
@@ -403,10 +404,11 @@ template <typename EXEC_POL,
                                            IterationMapping>::value &&
                                (IterationGetter::block_size <= 0),
                            size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
-__global__ void forallp_hip_kernel(const LOOP_BODY loop_body,
-                                   const Iterator idx,
-                                   const IndexType length,
-                                   ForallParam f_params)
+__global__ RAJA_JIT_COMPILE_ARGS(3) void forallp_hip_kernel(
+    const LOOP_BODY loop_body,
+    const Iterator idx,
+    const IndexType length,
+    ForallParam f_params)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
@@ -435,10 +437,10 @@ template<
                               (IterationGetter::block_size > 0),
                           size_t> BlockSize = IterationGetter::block_size>
 __launch_bounds__(BlockSize, 1) __global__
-    void forallp_hip_kernel(const LOOP_BODY loop_body,
-                            const Iterator idx,
-                            const IndexType length,
-                            ForallParam f_params)
+    RAJA_JIT_COMPILE_ARGS(3) void forallp_hip_kernel(const LOOP_BODY loop_body,
+                                                     const Iterator idx,
+                                                     const IndexType length,
+                                                     ForallParam f_params)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
@@ -466,10 +468,11 @@ template<
                                            IterationMapping>::value &&
                                (IterationGetter::block_size <= 0),
                            size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
-__global__ void forallp_hip_kernel(const LOOP_BODY loop_body,
-                                   const Iterator idx,
-                                   const IndexType length,
-                                   ForallParam f_params)
+__global__ RAJA_JIT_COMPILE_ARGS(3) void forallp_hip_kernel(
+    const LOOP_BODY loop_body,
+    const Iterator idx,
+    const IndexType length,
+    ForallParam f_params)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
@@ -536,7 +539,7 @@ forall_impl(resources::Hip hip_res,
   // Only launch kernel if we have something to iterate over
   if (len > 0)
   {
-
+    RAJA::register_lambda(loop_body);
     auto func = reinterpret_cast<const void*>(
         &impl::forallp_hip_kernel<EXEC_POL, Iterator, LOOP_BODY, IndexType,
                                   camp::decay<ForallParam>>);
@@ -620,6 +623,7 @@ RAJA_INLINE resources::EventProxy<resources::Hip> forall_impl(
     const TypedIndexSet<SegmentTypes...>& iset,
     LoopBody&& loop_body)
 {
+  RAJA::register_lambda(loop_body);
   int num_seg = iset.getNumSegments();
   for (int isi = 0; isi < num_seg; ++isi)
   {
diff --git a/include/RAJA/policy/hip/kernel/HipKernel.hpp b/include/RAJA/policy/hip/kernel/HipKernel.hpp
index 3c3589432d..4e8e8d6ec4 100644
--- a/include/RAJA/policy/hip/kernel/HipKernel.hpp
+++ b/include/RAJA/policy/hip/kernel/HipKernel.hpp
@@ -30,6 +30,7 @@
 
 #include "RAJA/util/macros.hpp"
 #include "RAJA/util/types.hpp"
+#include "RAJA/util/Jit.hpp"
 
 #include "RAJA/pattern/kernel.hpp"
 #include "RAJA/pattern/kernel/For.hpp"
@@ -179,7 +180,7 @@ namespace internal
  * HIP global function for launching HipKernel policies
  */
 template <typename Data, typename Exec>
-__global__ void HipKernelLauncher(const Data data)
+__global__ RAJA_JIT_COMPILE void HipKernelLauncher(const Data data)
 {
   using data_t = camp::decay<Data>;
 
@@ -198,7 +199,7 @@ __global__ void HipKernelLauncher(const Data data)
  * This launcher is used by the HipKernelFixed policies.
 */
 template <typename Data, size_t BlockSize, typename Exec>
-__launch_bounds__(BlockSize, 1) __global__
+__launch_bounds__(BlockSize, 1) __global__ RAJA_JIT_COMPILE
     void HipKernelLauncherFixed(const Data data)
 
 {
@@ -639,7 +640,7 @@ struct StatementExecutor<
       auto hip_data = RAJA::hip::make_launch_body(
           func, launch_dims.dims.blocks, launch_dims.dims.threads, shmem, res,
           data);
-
+      RAJA::register_lambda(func);
       //
       // Launch the kernel
       //
diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp
index f3ae8f87c1..27c33c08b3 100644
--- a/include/RAJA/policy/hip/launch.hpp
+++ b/include/RAJA/policy/hip/launch.hpp
@@ -24,13 +24,15 @@
 
 #include "RAJA/policy/hip/MemUtils_HIP.hpp"
 #include "RAJA/policy/hip/raja_hiperrchk.hpp"
 #include "RAJA/util/resource.hpp"
+#include "RAJA/util/Jit.hpp"
 
 namespace RAJA
 {
 
 template <typename BODY, typename ReduceParams>
-__global__ void launch_new_reduce_global_fcn(const BODY body_in,
-                                             ReduceParams reduce_params)
+__global__ RAJA_JIT_COMPILE void launch_new_reduce_global_fcn(
+    const BODY body_in,
+    ReduceParams reduce_params)
 {
   LaunchContext ctx;
@@ -92,7 +94,7 @@ struct LaunchExecute<
     if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero &&
         blockSize.x > zero && blockSize.y > zero && blockSize.z > zero)
     {
-
+      RAJA::register_lambda(body_in);
       RAJA_FT_BEGIN;
 
       size_t shared_mem_size = launch_params.shared_mem_size;
@@ -133,7 +135,7 @@ struct LaunchExecute<
 };
 
 template <typename BODY, int num_threads, typename ReduceParams>
-__launch_bounds__(num_threads, 1) __global__
+__launch_bounds__(num_threads, 1) __global__ RAJA_JIT_COMPILE
     void launch_new_reduce_global_fcn_fixed(const BODY body_in,
                                             ReduceParams reduce_params)
 {
@@ -210,7 +212,7 @@ struct LaunchExecute<
     launch_info.res = hip_res;
 
     {
-
+      RAJA::register_lambda(body_in);
       RAJA::expt::ParamMultiplexer::parampack_init(pol, launch_reducers,
                                                    launch_info);
diff --git a/include/RAJA/util/Jit.hpp b/include/RAJA/util/Jit.hpp
new file mode 100644
index 0000000000..33df0d746c
--- /dev/null
+++ b/include/RAJA/util/Jit.hpp
@@ -0,0 +1,34 @@
+#ifndef RAJA_jit_HPP
+#define RAJA_jit_HPP
+
+#include <utility>
+
+#include "RAJA/config.hpp"
+#if defined(RAJA_ENABLE_JIT)
+#include "proteus/JitInterface.hpp"
+#endif
+
+namespace RAJA
+{
+template <typename Lambda>
+inline auto register_lambda(Lambda&& lambda)
+{
+#if defined(RAJA_ENABLE_JIT)
+  return proteus::register_lambda(std::forward<Lambda>(lambda));
+#else
+  return std::forward<Lambda>(lambda);
+#endif
+}
+
+template <typename T>
+inline auto jit_variable(T&& arg)
+{
+#if defined(RAJA_ENABLE_JIT)
+  return proteus::jit_variable(std::forward<T>(arg));
+#else
+  return std::forward<T>(arg);
+#endif
+}
+
+}  // namespace RAJA
+#endif
diff --git a/include/RAJA/util/macros.hpp b/include/RAJA/util/macros.hpp
index a41179c1a3..7c2702fcc8 100644
--- a/include/RAJA/util/macros.hpp
+++ b/include/RAJA/util/macros.hpp
@@ -224,4 +224,12 @@ inline void RAJA_ABORT_OR_THROW(const char* str)
 
 #endif
 
+#if defined(RAJA_ENABLE_JIT)
+#define RAJA_JIT_COMPILE_ARGS(...) __attribute__((annotate("jit", __VA_ARGS__)))
+#define RAJA_JIT_COMPILE __attribute__((annotate("jit")))
+#else
+#define RAJA_JIT_COMPILE_ARGS(...)
+#define RAJA_JIT_COMPILE
+#endif
+
 #endif /* RAJA_INTERNAL_MACROS_HPP */
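
Reviewer note (not part of the patch): a minimal sketch of how the new hooks are exercised end to end. The cmake invocation, the saxpy kernel, and the jit_variable usage below are illustrative assumptions, not code from this change; PROTEUS_INSTALL_DIR must point at an existing Proteus install.

// Assumed configure line:
//   cmake -DRAJA_ENABLE_JIT=On -DPROTEUS_INSTALL_DIR=/path/to/proteus ..

#include "RAJA/RAJA.hpp"

void saxpy(double* x, double* y, double a, int n)
{
  // Hypothetical use of the new wrapper: mark 'a' for value specialization.
  // With RAJA_ENABLE_JIT off, jit_variable is a pass-through.
  auto a_jit = RAJA::jit_variable(a);

  // User code is otherwise unchanged: forall_impl calls
  // RAJA::register_lambda(loop_body) before launching the
  // RAJA_JIT_COMPILE_ARGS-annotated kernel.
  RAJA::forall<RAJA::cuda_exec<256>>(RAJA::RangeSegment(0, n),
                                     [=] RAJA_DEVICE(int i)
                                     { y[i] = a_jit * x[i] + y[i]; });
}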