Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,13 @@ install(FILES
# Setup internal RAJA configuration options
include(cmake/SetupRajaConfig.cmake)

if (RAJA_ENABLE_JIT)
  # JIT support pulls in Proteus headers; fail early with a clear message
  # instead of silently adding "/include" when the variable is unset.
  if (NOT DEFINED PROTEUS_INSTALL_DIR)
    message(FATAL_ERROR
      "RAJA_ENABLE_JIT is ON but PROTEUS_INSTALL_DIR is not set. "
      "Set -DPROTEUS_INSTALL_DIR=<path-to-proteus-install>.")
  endif()
  target_include_directories(RAJA
    PUBLIC
    "${PROTEUS_INSTALL_DIR}/include"
  )
endif()

if(RAJA_ENABLE_TESTS)
add_subdirectory(test)
endif()
Expand Down
1 change: 1 addition & 0 deletions cmake/SetupRajaOptions.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ option(RAJA_ENABLE_RUNTIME_PLUGINS "Enable support for loading plugins at runtim
option(RAJA_ALLOW_INCONSISTENT_OPTIONS "Enable inconsistent values for ENABLE_X and RAJA_ENABLE_X options" Off)

option(RAJA_ENABLE_DESUL_ATOMICS "Enable support of desul atomics" Off)
option(RAJA_ENABLE_JIT "Enable JIT compilation for RAJA kernels" Off)
set(DESUL_ENABLE_TESTS Off CACHE BOOL "")

set(TEST_DRIVER "" CACHE STRING "driver used to wrap test commands")
4 changes: 4 additions & 0 deletions include/RAJA/RAJA.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,10 @@
#include "RAJA/policy/hip.hpp"
#endif

#if defined(RAJA_ENABLE_JIT)
#include "proteus/JitInterface.hpp"
#endif

#if defined(RAJA_ENABLE_SYCL)
#include "RAJA/policy/sycl.hpp"
#endif
Expand Down
1 change: 1 addition & 0 deletions include/RAJA/config.hpp.in
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@ static_assert(RAJA_HAS_SOME_CXX14,
#cmakedefine RAJA_ENABLE_CLANG_CUDA
#cmakedefine RAJA_ENABLE_HIP
#cmakedefine RAJA_ENABLE_SYCL
#cmakedefine RAJA_ENABLE_JIT

#cmakedefine RAJA_ENABLE_OMP_TASK
#cmakedefine RAJA_ALLOW_OPENMP_5_1_ATOMICS
Expand Down
26 changes: 15 additions & 11 deletions include/RAJA/policy/cuda/forall.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@

#include "RAJA/util/macros.hpp"
#include "RAJA/util/types.hpp"
#include "RAJA/util/Jit.hpp"

#include "RAJA/internal/fault_tolerance.hpp"

Expand Down Expand Up @@ -381,10 +382,11 @@ template<typename EXEC_POL,
(IterationGetter::block_size > 0),
size_t> BlockSize = IterationGetter::block_size>
__launch_bounds__(BlockSize, BlocksPerSM) __global__
void forallp_cuda_kernel(const RAJA_CUDA_GRID_CONSTANT LOOP_BODY loop_body,
const RAJA_CUDA_GRID_CONSTANT Iterator idx,
const RAJA_CUDA_GRID_CONSTANT IndexType length,
ForallParam f_params)
RAJA_JIT_COMPILE_ARGS(3) void forallp_cuda_kernel(
const RAJA_CUDA_GRID_CONSTANT LOOP_BODY loop_body,
const RAJA_CUDA_GRID_CONSTANT Iterator idx,
const RAJA_CUDA_GRID_CONSTANT IndexType length,
ForallParam f_params)
{
using RAJA::internal::thread_privatize;
auto privatizer = thread_privatize(loop_body);
Expand Down Expand Up @@ -412,7 +414,7 @@ template<typename EXEC_POL,
IterationMapping>::value &&
(IterationGetter::block_size <= 0),
size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
__global__ void forallp_cuda_kernel(
__global__ RAJA_JIT_COMPILE_ARGS(3) void forallp_cuda_kernel(
const RAJA_CUDA_GRID_CONSTANT LOOP_BODY loop_body,
const RAJA_CUDA_GRID_CONSTANT Iterator idx,
const RAJA_CUDA_GRID_CONSTANT IndexType length,
Expand Down Expand Up @@ -448,10 +450,11 @@ template<
(IterationGetter::block_size > 0),
size_t> BlockSize = IterationGetter::block_size>
__launch_bounds__(BlockSize, BlocksPerSM) __global__
void forallp_cuda_kernel(const RAJA_CUDA_GRID_CONSTANT LOOP_BODY loop_body,
const RAJA_CUDA_GRID_CONSTANT Iterator idx,
const RAJA_CUDA_GRID_CONSTANT IndexType length,
ForallParam f_params)
RAJA_JIT_COMPILE_ARGS(3) void forallp_cuda_kernel(
const RAJA_CUDA_GRID_CONSTANT LOOP_BODY loop_body,
const RAJA_CUDA_GRID_CONSTANT Iterator idx,
const RAJA_CUDA_GRID_CONSTANT IndexType length,
ForallParam f_params)
{
using RAJA::internal::thread_privatize;
auto privatizer = thread_privatize(loop_body);
Expand Down Expand Up @@ -482,7 +485,7 @@ template<
IterationMapping>::value &&
(IterationGetter::block_size <= 0),
size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
__global__ void forallp_cuda_kernel(
__global__ RAJA_JIT_COMPILE_ARGS(3) void forallp_cuda_kernel(
const RAJA_CUDA_GRID_CONSTANT LOOP_BODY loop_body,
const RAJA_CUDA_GRID_CONSTANT Iterator idx,
const RAJA_CUDA_GRID_CONSTANT IndexType length,
Expand Down Expand Up @@ -554,7 +557,7 @@ forall_impl(resources::Cuda cuda_res,
// Only launch kernel if we have something to iterate over
if (len > 0)
{

RAJA::register_lambda(loop_body);
auto func = reinterpret_cast<const void*>(
&impl::forallp_cuda_kernel<EXEC_POL, BlocksPerSM, Iterator, LOOP_BODY,
IndexType, camp::decay<ForallParam>>);
Expand Down Expand Up @@ -642,6 +645,7 @@ RAJA_INLINE resources::EventProxy<resources::Cuda> forall_impl(
LoopBody&& loop_body)
{
int num_seg = iset.getNumSegments();
RAJA::register_lambda(loop_body);
for (int isi = 0; isi < num_seg; ++isi)
{
iset.segmentCall(
Expand Down
8 changes: 5 additions & 3 deletions include/RAJA/policy/cuda/kernel/CudaKernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@

#include "RAJA/util/macros.hpp"
#include "RAJA/util/types.hpp"
#include "RAJA/util/Jit.hpp"

#include "RAJA/pattern/kernel.hpp"
#include "RAJA/pattern/kernel/For.hpp"
Expand Down Expand Up @@ -211,7 +212,8 @@ namespace internal
* CUDA global function for launching CudaKernel policies
*/
template<typename Data, typename Exec>
__global__ void CudaKernelLauncher(const RAJA_CUDA_GRID_CONSTANT Data data)
__global__ RAJA_JIT_COMPILE void CudaKernelLauncher(
const RAJA_CUDA_GRID_CONSTANT Data data)
{

using data_t = camp::decay<Data>;
Expand All @@ -231,7 +233,7 @@ __global__ void CudaKernelLauncher(const RAJA_CUDA_GRID_CONSTANT Data data)
* This launcher is used by the CudaKernelFixed policies.
*/
template<int BlockSize, int BlocksPerSM, typename Data, typename Exec>
__launch_bounds__(BlockSize, BlocksPerSM) __global__
__launch_bounds__(BlockSize, BlocksPerSM) __global__ RAJA_JIT_COMPILE
void CudaKernelLauncherFixed(const RAJA_CUDA_GRID_CONSTANT Data data)
{

Expand Down Expand Up @@ -681,7 +683,7 @@ struct StatementExecutor<
auto cuda_data = RAJA::cuda::make_launch_body(
func, launch_dims.dims.blocks, launch_dims.dims.threads, shmem, res,
data);

RAJA::register_lambda(func);
//
// Launch the kernel
//
Expand Down
13 changes: 7 additions & 6 deletions include/RAJA/policy/cuda/launch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,15 @@
#include "RAJA/policy/cuda/MemUtils_CUDA.hpp"
#include "RAJA/policy/cuda/raja_cudaerrchk.hpp"
#include "RAJA/util/resource.hpp"
#include "RAJA/util/Jit.hpp"

namespace RAJA
{

template<typename BODY, typename ReduceParams>
__global__ void launch_new_reduce_global_fcn(const RAJA_CUDA_GRID_CONSTANT BODY
body_in,
ReduceParams reduce_params)
__global__ RAJA_JIT_COMPILE void launch_new_reduce_global_fcn(
const RAJA_CUDA_GRID_CONSTANT BODY body_in,
ReduceParams reduce_params)
{
LaunchContext ctx;

Expand Down Expand Up @@ -95,7 +96,7 @@ struct LaunchExecute<
if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero &&
blockSize.x > zero && blockSize.y > zero && blockSize.z > zero)
{

RAJA::register_lambda(body_in);
RAJA_FT_BEGIN;

size_t shared_mem_size = launch_params.shared_mem_size;
Expand Down Expand Up @@ -138,7 +139,7 @@ template<typename BODY,
int num_threads,
size_t BLOCKS_PER_SM,
typename ReduceParams>
__launch_bounds__(num_threads, BLOCKS_PER_SM) __global__
__launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ RAJA_JIT_COMPILE
void launch_new_reduce_global_fcn_fixed(const RAJA_CUDA_GRID_CONSTANT BODY
body_in,
ReduceParams reduce_params)
Expand Down Expand Up @@ -206,7 +207,7 @@ struct LaunchExecute<
if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero &&
blockSize.x > zero && blockSize.y > zero && blockSize.z > zero)
{

RAJA::register_lambda(body_in);
RAJA_FT_BEGIN;

size_t shared_mem_size = launch_params.shared_mem_size;
Expand Down
38 changes: 21 additions & 17 deletions include/RAJA/policy/hip/forall.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@

#include "RAJA/util/macros.hpp"
#include "RAJA/util/types.hpp"
#include "RAJA/util/Jit.hpp"

#include "RAJA/internal/fault_tolerance.hpp"

Expand Down Expand Up @@ -374,10 +375,10 @@ template<typename EXEC_POL,
(IterationGetter::block_size > 0),
size_t> BlockSize = IterationGetter::block_size>
__launch_bounds__(BlockSize, 1) __global__
void forallp_hip_kernel(const LOOP_BODY loop_body,
const Iterator idx,
const IndexType length,
ForallParam f_params)
RAJA_JIT_COMPILE_ARGS(3) void forallp_hip_kernel(const LOOP_BODY loop_body,
const Iterator idx,
const IndexType length,
ForallParam f_params)
{
using RAJA::internal::thread_privatize;
auto privatizer = thread_privatize(loop_body);
Expand All @@ -403,10 +404,11 @@ template<typename EXEC_POL,
IterationMapping>::value &&
(IterationGetter::block_size <= 0),
size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
__global__ void forallp_hip_kernel(const LOOP_BODY loop_body,
const Iterator idx,
const IndexType length,
ForallParam f_params)
__global__ RAJA_JIT_COMPILE_ARGS(3) void forallp_hip_kernel(
const LOOP_BODY loop_body,
const Iterator idx,
const IndexType length,
ForallParam f_params)
{
using RAJA::internal::thread_privatize;
auto privatizer = thread_privatize(loop_body);
Expand Down Expand Up @@ -435,10 +437,10 @@ template<
(IterationGetter::block_size > 0),
size_t> BlockSize = IterationGetter::block_size>
__launch_bounds__(BlockSize, 1) __global__
void forallp_hip_kernel(const LOOP_BODY loop_body,
const Iterator idx,
const IndexType length,
ForallParam f_params)
RAJA_JIT_COMPILE_ARGS(3) void forallp_hip_kernel(const LOOP_BODY loop_body,
const Iterator idx,
const IndexType length,
ForallParam f_params)
{
using RAJA::internal::thread_privatize;
auto privatizer = thread_privatize(loop_body);
Expand Down Expand Up @@ -466,10 +468,11 @@ template<
IterationMapping>::value &&
(IterationGetter::block_size <= 0),
size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
__global__ void forallp_hip_kernel(const LOOP_BODY loop_body,
const Iterator idx,
const IndexType length,
ForallParam f_params)
__global__ RAJA_JIT_COMPILE_ARGS(3) void forallp_hip_kernel(
const LOOP_BODY loop_body,
const Iterator idx,
const IndexType length,
ForallParam f_params)
{
using RAJA::internal::thread_privatize;
auto privatizer = thread_privatize(loop_body);
Expand Down Expand Up @@ -536,7 +539,7 @@ forall_impl(resources::Hip hip_res,
// Only launch kernel if we have something to iterate over
if (len > 0)
{

RAJA::register_lambda(loop_body);
auto func = reinterpret_cast<const void*>(
&impl::forallp_hip_kernel<EXEC_POL, Iterator, LOOP_BODY, IndexType,
camp::decay<ForallParam>>);
Expand Down Expand Up @@ -620,6 +623,7 @@ RAJA_INLINE resources::EventProxy<resources::Hip> forall_impl(
const TypedIndexSet<SegmentTypes...>& iset,
LoopBody&& loop_body)
{
RAJA::register_lambda(loop_body);
int num_seg = iset.getNumSegments();
for (int isi = 0; isi < num_seg; ++isi)
{
Expand Down
7 changes: 4 additions & 3 deletions include/RAJA/policy/hip/kernel/HipKernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@

#include "RAJA/util/macros.hpp"
#include "RAJA/util/types.hpp"
#include "RAJA/util/Jit.hpp"

#include "RAJA/pattern/kernel.hpp"
#include "RAJA/pattern/kernel/For.hpp"
Expand Down Expand Up @@ -179,7 +180,7 @@ namespace internal
* HIP global function for launching HipKernel policies
*/
template<typename Data, typename Exec>
__global__ void HipKernelLauncher(const Data data)
__global__ RAJA_JIT_COMPILE void HipKernelLauncher(const Data data)
{

using data_t = camp::decay<Data>;
Expand All @@ -198,7 +199,7 @@ __global__ void HipKernelLauncher(const Data data)
* This launcher is used by the HipKernelFixed policies.
*/
template<int BlockSize, typename Data, typename Exec>
__launch_bounds__(BlockSize, 1) __global__
__launch_bounds__(BlockSize, 1) __global__ RAJA_JIT_COMPILE
void HipKernelLauncherFixed(const Data data)
{

Expand Down Expand Up @@ -639,7 +640,7 @@ struct StatementExecutor<
auto hip_data = RAJA::hip::make_launch_body(
func, launch_dims.dims.blocks, launch_dims.dims.threads, shmem, res,
data);

RAJA::register_lambda(func);
//
// Launch the kernel
//
Expand Down
12 changes: 7 additions & 5 deletions include/RAJA/policy/hip/launch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,15 @@
#include "RAJA/policy/hip/MemUtils_HIP.hpp"
#include "RAJA/policy/hip/raja_hiperrchk.hpp"
#include "RAJA/util/resource.hpp"
#include "RAJA/util/Jit.hpp"

namespace RAJA
{

template<typename BODY, typename ReduceParams>
__global__ void launch_new_reduce_global_fcn(const BODY body_in,
ReduceParams reduce_params)
__global__ RAJA_JIT_COMPILE void launch_new_reduce_global_fcn(
const BODY body_in,
ReduceParams reduce_params)
{
LaunchContext ctx;

Expand Down Expand Up @@ -92,7 +94,7 @@ struct LaunchExecute<
if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero &&
blockSize.x > zero && blockSize.y > zero && blockSize.z > zero)
{

RAJA::register_lambda(body_in);
RAJA_FT_BEGIN;

size_t shared_mem_size = launch_params.shared_mem_size;
Expand Down Expand Up @@ -133,7 +135,7 @@ struct LaunchExecute<
};

template<typename BODY, int num_threads, typename ReduceParams>
__launch_bounds__(num_threads, 1) __global__
__launch_bounds__(num_threads, 1) __global__ RAJA_JIT_COMPILE
void launch_new_reduce_global_fcn_fixed(const BODY body_in,
ReduceParams reduce_params)
{
Expand Down Expand Up @@ -210,7 +212,7 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, nthreads>>
launch_info.res = hip_res;

{

RAJA::register_lambda(body_in);
RAJA::expt::ParamMultiplexer::parampack_init(pol, launch_reducers,
launch_info);

Expand Down
34 changes: 34 additions & 0 deletions include/RAJA/util/Jit.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#ifndef RAJA_jit_HPP
#define RAJA_jit_HPP

#include "RAJA/config.hpp"

#include <utility>

#if defined(RAJA_ENABLE_JIT)
#include "proteus/JitInterface.hpp"
#endif

namespace RAJA
{

/*!
 * \brief Register a lambda with the Proteus JIT runtime.
 *
 * When RAJA_ENABLE_JIT is defined, the lambda is handed to
 * proteus::register_lambda so the JIT can specialize generated kernels
 * on it; otherwise this is a pass-through that returns the lambda
 * unchanged, so call sites compile identically in non-JIT builds.
 *
 * \param lambda callable to register (forwarding reference)
 * \return result of proteus::register_lambda, or the lambda itself
 */
template<typename Lambda>
inline auto register_lambda(Lambda&& lambda)
{
#if defined(RAJA_ENABLE_JIT)
  return proteus::register_lambda(std::forward<Lambda>(lambda));
#else
  return std::forward<Lambda>(lambda);
#endif
}

/*!
 * \brief Mark a value as a JIT specialization variable.
 *
 * When RAJA_ENABLE_JIT is defined, the value is handed to
 * proteus::jit_variable so the JIT can bake it into generated code;
 * otherwise the value is returned unchanged.
 *
 * \param arg value taken by copy; moved into Proteus when JIT is enabled
 * \return the Proteus-wrapped variable, or the value itself
 */
template<typename T>
inline auto jit_variable(T arg)
{
#if defined(RAJA_ENABLE_JIT)
  // arg is a by-value parameter, so std::move (not std::forward) is the
  // correct way to hand it off; T is not a forwarding reference here.
  return proteus::jit_variable(std::move(arg));
#else
  return arg;  // function parameters are implicitly moved on return
#endif
}

}  // namespace RAJA

#endif  // RAJA_jit_HPP
Loading